Example #1
    def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                              gauges):
        container_name = utils.walk_json_field_safe(stats, "name")
        pai_service_name = ContainerCollector.infer_service_name(
            container_name)

        inspect_info = docker_inspect.inspect(
            container_id, ContainerCollector.inspect_histogram,
            ContainerCollector.inspect_timeout, self.gpu_vendor)

        pid = inspect_info.pid
        job_name = inspect_info.job_name

        logger.debug("%s has inspect result %s, service_name %s",
                     container_name, inspect_info, pai_service_name)

        if job_name is None and pai_service_name is None:
            logger.debug("%s is ignored", container_name)
            return  # other container, maybe kubelet or api-server

        # Get network consumption ourselves: since all our services/jobs run
        # in the host network, the network statistics reported by docker are
        # not specific to this container.
        lsof_result = network.lsof(pid, ContainerCollector.lsof_histogram,
                                   ContainerCollector.lsof_timeout)

        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)
        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.exec_cmd(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)

            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d", pid,
                debug_info.strip(), lsof_result, net_in, net_out)

        if pai_service_name is None:
            gpu_ids, container_labels = ContainerCollector.parse_from_labels(
                inspect_info, gpu_infos)

            if gpu_infos:
                for gpu_id in gpu_ids:
                    # Skip ids the GPU exporter reported no data for.
                    if gpu_infos.get(gpu_id) is None:
                        continue

                    nvidia_gpu_status = gpu_infos[gpu_id]
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = gpu_id

                    gauges.add_value("task_gpu_percent", labels,
                                     nvidia_gpu_status.gpu_util)
                    gauges.add_value("task_gpu_mem_percent", labels,
                                     nvidia_gpu_status.gpu_mem_util)

            gauges.add_value("task_cpu_percent", container_labels,
                             stats["CPUPerc"])
            gauges.add_value("task_mem_usage_byte", container_labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("task_mem_limit_byte", container_labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("task_net_in_byte", container_labels, net_in)
            gauges.add_value("task_net_out_byte", container_labels, net_out)
            gauges.add_value("task_block_in_byte", container_labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("task_block_out_byte", container_labels,
                             stats["BlockIO"]["out"])
            gauges.add_value("task_mem_usage_percent", container_labels,
                             stats["MemPerc"])
        else:
            labels = {"name": pai_service_name}
            gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
            gauges.add_value("service_mem_usage_byte", labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("service_mem_limit_byte", labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("service_mem_usage_percent", labels,
                             stats["MemPerc"])
            gauges.add_value("service_net_in_byte", labels, net_in)
            gauges.add_value("service_net_out_byte", labels, net_out)
            gauges.add_value("service_block_in_byte", labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("service_block_out_byte", labels,
                             stats["BlockIO"]["out"])
Example #2
def collect_job_metrics(gpu_infos, all_conns, type1_zombies, type2_zombies):
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container_id, stats in stats_obj.items():
        pai_service_name = None

        # TODO: speed this up; this inner scan makes the loop
        # O(containers * services). See the sketch after this function.
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:]  # remove "k8s_" prefix
                break

        inspect_info = docker_inspect.inspect(container_id)
        pid = utils.walk_json_field_safe(inspect_info, "pid")
        inspect_labels = utils.walk_json_field_safe(inspect_info, "labels")

        if not inspect_labels and pai_service_name is None:
            continue  # other container, maybe kubelet or api-server

        # Get network consumption ourselves: since all our services/jobs run
        # in the host network, the network statistics reported by docker are
        # not specific to this container.
        lsof_result = network.lsof(pid)
        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)
        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.check_output(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)

            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d", pid,
                debug_info, lsof_result, net_in, net_out)

        if pai_service_name is None:
            gpu_ids, other_labels = parse_from_labels(inspect_info["labels"])
            other_labels.update(inspect_info["env"])

            if gpu_infos:
                for gpu_id in gpu_ids:
                    # Guard against ids that gpu_infos has no entry for;
                    # indexing them directly would raise a KeyError.
                    if gpu_infos.get(gpu_id) is None:
                        continue

                    labels = copy.deepcopy(other_labels)
                    labels["minor_number"] = gpu_id

                    result.append(
                        Metric("container_GPUPerc", labels,
                               gpu_infos[gpu_id]["gpuUtil"]))
                    result.append(
                        Metric("container_GPUMemPerc", labels,
                               gpu_infos[gpu_id]["gpuMemUtil"]))

            result.append(
                Metric("container_CPUPerc", other_labels, stats["CPUPerc"]))
            result.append(
                Metric("container_MemUsage", other_labels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("container_MemLimit", other_labels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", other_labels, net_in))
            result.append(Metric("container_NetOut", other_labels, net_out))
            result.append(
                Metric("container_BlockIn", other_labels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("container_BlockOut", other_labels,
                       stats["BlockIO"]["out"]))
            result.append(
                Metric("container_MemPerc", other_labels, stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(
                Metric("service_cpu_percent", labels, stats["CPUPerc"]))
            result.append(
                Metric("service_mem_usage_byte", labels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("service_mem_limit_byte", labels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(
                Metric("service_mem_usage_percent", labels, stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, net_in))
            result.append(Metric("service_net_out_byte", labels, net_out))
            result.append(
                Metric("service_block_in_byte", labels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("service_block_out_byte", labels,
                       stats["BlockIO"]["out"]))

    result.extend(
        generate_zombie_count(stats_obj, type1_zombies, type2_zombies))

    return result
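The TODO in the loop above flags the prefix scan: every container name is compared against every entry in pai_services, making the loop O(containers * services). One possible speed-up is sketched below; it assumes the prefixes are known up front and, importantly, that no service prefix is itself a prefix of another (build_prefix_index and match_service are illustrative names, not part of the exporter):

import bisect


def build_prefix_index(pai_services):
    # Sort once, outside the per-container loop.
    return sorted(pai_services)


def match_service(container_name, sorted_prefixes):
    # Any prefix of container_name sorts <= container_name, so with no
    # prefix nested inside another, the only candidate is the rightmost
    # entry <= container_name, found in O(log n) by bisect.
    i = bisect.bisect_right(sorted_prefixes, container_name) - 1
    if i >= 0 and container_name.startswith(sorted_prefixes[i]):
        return sorted_prefixes[i][4:]  # drop the "k8s_" header, as above
    return None

Inside the loop, pai_service_name = match_service(stats["name"], prefix_index) would then replace the inner scan over pai_services.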