Example #1
def collect_job_metrics(gpuInfos):
    stats = docker_stats.stats()
    if stats is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container in stats:
        inspectInfo = docker_inspect.inspect(container)
        if inspectInfo is None or not inspectInfo["labels"]:
            continue

        gpuIds, otherLabels = parse_from_labels(inspectInfo["labels"])
        otherLabels.update(inspectInfo["env"])

        for id in gpuIds:
            if gpuInfos:
                logger.info(gpuInfos)
                labels = copy.deepcopy(otherLabels)
                labels["minor_number"] = id

                result.append(Metric("container_GPUPerc", labels, gpuInfos[id]["gpuUtil"]))
                result.append(Metric("container_GPUMemPerc", labels, gpuInfos[id]["gpuMemUtil"]))

        result.append(Metric("container_CPUPerc", otherLabels, stats[container]["CPUPerc"]))
        result.append(Metric("container_MemUsage", otherLabels, stats[container]["MemUsage_Limit"]["usage"]))
        result.append(Metric("container_MemLimit", otherLabels, stats[container]["MemUsage_Limit"]["limit"]))
        result.append(Metric("container_NetIn", otherLabels, stats[container]["NetIO"]["in"]))
        result.append(Metric("container_NetOut", otherLabels, stats[container]["NetIO"]["out"]))
        result.append(Metric("container_BlockIn", otherLabels, stats[container]["BlockIO"]["in"]))
        result.append(Metric("container_BlockOut", otherLabels, stats[container]["BlockIO"]["out"]))
        result.append(Metric("container_MemPerc", otherLabels, stats[container]["MemPerc"]))

    return result
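The `Metric` records appended above are not defined in this snippet. A minimal sketch of a container with the same shape, assuming it only needs to carry a metric name, a label dict, and a value (the project's actual class may differ):

import collections

# Hypothetical stand-in for the Metric record used above: name, label dict, value.
Metric = collections.namedtuple("Metric", ["name", "labels", "value"])

# Mirrors the calls in collect_job_metrics; the values here are made up.
sample = Metric("container_CPUPerc", {"minor_number": "0"}, 42.5)
# sample.name == "container_CPUPerc"; sample.value == 42.5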
Example #2
def collect_job_metrics(gpuInfos):
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container_id, stats in stats_obj.items():
        pai_service_name = None

        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:] # remove "k8s_" prefix
                break

        if pai_service_name is None:
            inspectInfo = docker_inspect.inspect(container_id)
            if inspectInfo is None or not inspectInfo["labels"]:
                continue

            gpuIds, otherLabels = parse_from_labels(inspectInfo["labels"])
            otherLabels.update(inspectInfo["env"])

            for id in gpuIds:
                if gpuInfos:
                    logger.info(gpuInfos)
                    labels = copy.deepcopy(otherLabels)
                    labels["minor_number"] = id

                    result.append(Metric("container_GPUPerc", labels, gpuInfos[id]["gpuUtil"]))
                    result.append(Metric("container_GPUMemPerc", labels, gpuInfos[id]["gpuMemUtil"]))

            result.append(Metric("container_CPUPerc", otherLabels, stats["CPUPerc"]))
            result.append(Metric("container_MemUsage", otherLabels, stats["MemUsage_Limit"]["usage"]))
            result.append(Metric("container_MemLimit", otherLabels, stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels, stats["NetIO"]["in"]))
            result.append(Metric("container_NetOut", otherLabels, stats["NetIO"]["out"]))
            result.append(Metric("container_BlockIn", otherLabels, stats["BlockIO"]["in"]))
            result.append(Metric("container_BlockOut", otherLabels, stats["BlockIO"]["out"]))
            result.append(Metric("container_MemPerc", otherLabels, stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(Metric("service_cpu_percent", labels, stats["CPUPerc"]))
            result.append(Metric("service_mem_usage_byte", labels, stats["MemUsage_Limit"]["usage"]))
            result.append(Metric("service_mem_limit_byte", labels, stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("service_mem_usage_percent", labels, stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, stats["NetIO"]["in"]))
            result.append(Metric("service_net_out_byte", labels, stats["NetIO"]["out"]))
            result.append(Metric("service_block_in_byte", labels, stats["BlockIO"]["in"]))
            result.append(Metric("service_block_out_byte", labels, stats["BlockIO"]["out"]))

    return result
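The loop assumes `docker_stats.stats()` returns a dict keyed by container id, with per-container fields matching the lookups above. A sketch of that implied shape, with made-up values for illustration only:

# Shape implied by the field accesses in the loop above; the container id and
# all values below are made up for illustration.
sample_stats = {
    "8f2d3c1ab9e0": {
        "name": "k8s_rest-server_pod0",        # container name reported by docker
        "CPUPerc": 12.5,                       # CPU usage, percent
        "MemPerc": 3.1,                        # memory usage, percent
        "MemUsage_Limit": {"usage": 104857600, "limit": 2147483648},  # bytes
        "NetIO": {"in": 1024, "out": 2048},    # bytes
        "BlockIO": {"in": 4096, "out": 512},   # bytes
    },
}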
Example #3
def genJobMetrics(logDir, gpuMetrics):
    stats = docker_stats.stats()
    outputFile = open(logDir + "/job_exporter.prom", "w")
    for container in stats:
        inspectInfo = docker_inspect.inspect(container)
        if inspectInfo is None or not inspectInfo["labels"]:
            continue
        gpuIds, labelStr = parseFromLabels(inspectInfo["labels"])
        envStr = parseFromEnv(inspectInfo["env"])
        labelStr = labelStr + envStr
        for id in gpuIds:
            print("gpu id")
            print(id)
            if gpuMetrics:
                print(gpuMetrics)
                containerGpuUtilStr = 'container_GPUPerc{{{0}minor_number=\"{1}\"}} {2}\n'.format(
                    labelStr, id, gpuMetrics[id]["gpuUtil"])
                containerMemUtilStr = 'container_GPUMemPerc{{{0}minor_number=\"{1}\"}} {2}\n'.format(
                    labelStr, id, gpuMetrics[id]["gpuMemUtil"])
                outputFile.write(containerGpuUtilStr)
                outputFile.write(containerMemUtilStr)

        containerCPUPerc = 'container_CPUPerc{{{0}}} {1}\n'.format(
            labelStr, stats[container]["CPUPerc"])
        containerMemUsage = 'container_MemUsage{{{0}}} {1}\n'.format(
            labelStr, stats[container]["MemUsage_Limit"]["usage"])
        containerMemLimit = 'container_MemLimit{{{0}}} {1}\n'.format(
            labelStr, stats[container]["MemUsage_Limit"]["limit"])
        containerNetIn = 'container_NetIn{{{0}}} {1}\n'.format(
            labelStr, stats[container]["NetIO"]["in"])
        containerNetOut = 'container_NetOut{{{0}}} {1}\n'.format(
            labelStr, stats[container]["NetIO"]["out"])
        containerBlockIn = 'container_BlockIn{{{0}}} {1}\n'.format(
            labelStr, stats[container]["BlockIO"]["in"])
        containerBlockOut = 'container_BlockOut{{{0}}} {1}\n'.format(
            labelStr, stats[container]["BlockIO"]["out"])
        containerMemPerc = 'container_MemPerc{{{0}}} {1}\n'.format(
            labelStr, stats[container]["MemPerc"])
        outputFile.write(containerCPUPerc)
        outputFile.write(containerMemUsage)
        outputFile.write(containerMemLimit)
        outputFile.write(containerNetIn)
        outputFile.write(containerNetOut)
        outputFile.write(containerBlockIn)
        outputFile.write(containerBlockOut)
        outputFile.write(containerMemPerc)
    outputFile.close()
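The doubled braces in the format strings above are how literal `{` and `}` survive `str.format`. A short sketch of the Prometheus text-format line this produces, with a made-up label string and value:

# labelStr and the value are made up; this only shows the resulting line.
labelStr = 'job_name="job1",username="alice",'
line = 'container_CPUPerc{{{0}}} {1}\n'.format(labelStr, 12.5)
# line == 'container_CPUPerc{job_name="job1",username="alice",} 12.5\n'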
Example #4
    def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                              gauges):
        container_name = utils.walk_json_field_safe(stats, "name")
        pai_service_name = ContainerCollector.infer_service_name(
            container_name)

        inspect_info = docker_inspect.inspect(
            container_id, ContainerCollector.inspect_histogram,
            ContainerCollector.inspect_timeout, self.gpu_vendor)

        if inspect_info is None:
            return

        pid = inspect_info.pid
        job_name = inspect_info.job_name

        logger.debug("%s has inspect result %s, service_name %s",
                     container_name, inspect_info, pai_service_name)

        if job_name is None and pai_service_name is None:
            logger.debug("%s is ignored", container_name)
            return  # other container, maybe kubelet or api-server

        # get network consumption; since all our services/jobs run in the host
        # network, the network statistics from docker are not specific to one
        # container, so we have to collect them ourselves.
        lsof_result = network.lsof(pid, ContainerCollector.lsof_histogram,
                                   ContainerCollector.lsof_timeout)

        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)
        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.exec_cmd(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)

            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d", pid,
                debug_info.strip(), lsof_result, net_in, net_out)

        if pai_service_name is None:
            gpu_ids, container_labels = ContainerCollector.parse_from_labels(
                inspect_info, gpu_infos)

            if gpu_infos:
                for id in gpu_ids:
                    if gpu_infos.get(id) is None:
                        continue

                    nvidia_gpu_status = gpu_infos[id]
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = id

                    gauges.add_value("task_gpu_percent", labels,
                                     nvidia_gpu_status.gpu_util)
                    gauges.add_value("task_gpu_mem_percent", labels,
                                     nvidia_gpu_status.gpu_mem_util)

            gauges.add_value("task_cpu_percent", container_labels,
                             stats["CPUPerc"])
            gauges.add_value("task_mem_usage_byte", container_labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("task_mem_limit_byte", container_labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("task_net_in_byte", container_labels, net_in)
            gauges.add_value("task_net_out_byte", container_labels, net_out)
            gauges.add_value("task_block_in_byte", container_labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("task_block_out_byte", container_labels,
                             stats["BlockIO"]["out"])
            gauges.add_value("task_mem_usage_percent", container_labels,
                             stats["MemPerc"])
        else:
            labels = {"name": pai_service_name}
            gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
            gauges.add_value("service_mem_usage_byte", labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("service_mem_limit_byte", labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("service_mem_usage_percent", labels,
                             stats["MemPerc"])
            gauges.add_value("service_net_in_byte", labels, net_in)
            gauges.add_value("service_net_out_byte", labels, net_out)
            gauges.add_value("service_block_in_byte", labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("service_block_out_byte", labels,
                             stats["BlockIO"]["out"])
Example #5
def collect_job_metrics(gpu_infos, all_conns, type1_zombies, type2_zombies):
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container_id, stats in stats_obj.items():
        pai_service_name = None

        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:]  # remove "k8s_" prefix
                break

        inspect_info = docker_inspect.inspect(container_id)
        pid = inspect_info["pid"] if inspect_info is not None else None
        inspect_labels = utils.walk_json_field_safe(inspect_info, "labels")

        if not inspect_labels and pai_service_name is None:
            continue  # other container, maybe kubelet or api-server

        # get network consumption; since all our services/jobs run in the host network,
        # the network statistics from docker are not specific to one container, so we
        # have to collect them ourselves.
        lsof_result = network.lsof(pid)
        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)
        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.check_output(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)

            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d", pid,
                debug_info, lsof_result, net_in, net_out)

        if pai_service_name is None:
            gpuIds, otherLabels = parse_from_labels(inspect_info["labels"])
            otherLabels.update(inspect_info["env"])

            for id in gpuIds:
                if gpu_infos:
                    labels = copy.deepcopy(otherLabels)
                    labels["minor_number"] = id

                    result.append(
                        Metric("container_GPUPerc", labels,
                               gpu_infos[id]["gpuUtil"]))
                    result.append(
                        Metric("container_GPUMemPerc", labels,
                               gpu_infos[id]["gpuMemUtil"]))

            result.append(
                Metric("container_CPUPerc", otherLabels, stats["CPUPerc"]))
            result.append(
                Metric("container_MemUsage", otherLabels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("container_MemLimit", otherLabels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels, net_in))
            result.append(Metric("container_NetOut", otherLabels, net_out))
            result.append(
                Metric("container_BlockIn", otherLabels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("container_BlockOut", otherLabels,
                       stats["BlockIO"]["out"]))
            result.append(
                Metric("container_MemPerc", otherLabels, stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(
                Metric("service_cpu_percent", labels, stats["CPUPerc"]))
            result.append(
                Metric("service_mem_usage_byte", labels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("service_mem_limit_byte", labels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(
                Metric("service_mem_usage_percent", labels, stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, net_in))
            result.append(Metric("service_net_out_byte", labels, net_out))
            result.append(
                Metric("service_block_in_byte", labels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("service_block_out_byte", labels,
                       stats["BlockIO"]["out"]))

    result.extend(
        generate_zombie_count(stats_obj, type1_zombies, type2_zombies))

    return result
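`utils.walk_json_field_safe` is used above as a null-safe lookup into the inspect result. A minimal sketch of such a helper, assuming it simply walks nested fields and returns None when any step is missing (the real utils module may behave differently):

def walk_json_field_safe(obj, *fields):
    # Walk nested dict/list fields; return None instead of raising when any
    # intermediate value is missing. Illustrative sketch only.
    try:
        for field in fields:
            obj = obj[field]
        return obj
    except (KeyError, IndexError, TypeError):
        return None

# walk_json_field_safe({"labels": {"k": "v"}}, "labels") == {"k": "v"}
# walk_json_field_safe(None, "labels") is None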
Example #6
    def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                              gauges, npu_infos, dcgm_infos):
        container_name = utils.walk_json_field_safe(stats, "name")
        pai_service_name = ContainerCollector.infer_service_name(container_name)

        inspect_info = docker_inspect.inspect(container_id,
                ContainerCollector.inspect_histogram,
                ContainerCollector.inspect_timeout)

        if inspect_info is None:
            return

        pid = inspect_info.pid
        job_name = inspect_info.job_name

        logger.debug("%s has inspect result %s, service_name %s",
                container_name, inspect_info, pai_service_name)

        if job_name is None and pai_service_name is None:
            logger.debug("%s is ignored", container_name)
            return # other container, maybe kubelet or api-server

        # get network consumption. If the container uses the host network, we
        # treat node network consumption as the container's consumption;
        # otherwise we use data from docker state. As a result, the network
        # consumption reported for a service on the host network equals the
        # node's network consumption.
        is_host_network = inspect_info.is_host_network
        if is_host_network:
            net_in, net_out = network.get_network_consumption(
                self.network_interface)
        else:
            net_in, net_out = network.get_non_host_network_consumption(pid)

        if pai_service_name is None:
            gpu_ids, npu_ids, container_labels = ContainerCollector.parse_from_labels(inspect_info, gpu_infos)
            logger.info("start collecting metrics for job %s", container_labels["job_name"])
            if container_labels["username"] == "unknown":
                logger.warning("job %s has no username, skipping", container_labels["job_name"])
                return
            if gpu_infos:
                for id in gpu_ids:
                    if gpu_infos.get(id) is None:
                        continue

                    nvidia_gpu_status = gpu_infos[id]
                    uuid = nvidia_gpu_status.uuid
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = id
                    labels["device_type"] = inspect_info.gpu_type or "unknown"
                    labels["uuid"] = uuid
                    labels["device_str"] = "nvidia.com/gpu"

                    gauges.add_value("task_device_percent",
                            labels, nvidia_gpu_status.gpu_util)
                    gauges.add_value("task_device_mem_percent",
                            labels, nvidia_gpu_status.gpu_mem_util)

            if npu_infos:
                for id in npu_ids:
                    if npu_infos.get(id) is None:
                        continue

                    npu_status = npu_infos[id]
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = id
                    labels["device_type"] = inspect_info.gpu_type or "unknown"
                    labels["device_str"] = "npu.huawei.com/NPU"
                    # each NPU device should have one unique identifier string
                    labels["uuid"] = id
                    if inspect_info.node_name:
                        labels["uuid"] = inspect_info.node_name + "_" + str(id)

                    gauges.add_value("task_device_percent",
                            labels, npu_status.npu_util)
                    gauges.add_value("task_device_mem_percent",
                            labels, npu_status.npu_mem_util)
            if dcgm_infos:
                for id in gpu_ids:
                    if dcgm_infos.get(id) is None:
                        continue
                    dcgm_metric = dcgm_infos[id]  # of type DCGMMetrics
                    uuid = dcgm_metric.uuid
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = id
                    labels["uuid"] = uuid
                    gauges.add_dcgm_metric(dcgm_metric, labels)

            gauges.add_value("task_cpu_percent", container_labels, stats["CPUPerc"])
            gauges.add_value("task_mem_usage_byte", container_labels, stats["MemUsage_Limit"]["usage"])
            gauges.add_value("task_mem_limit_byte", container_labels, stats["MemUsage_Limit"]["limit"])
            gauges.add_value("task_net_in_byte", container_labels, net_in)
            gauges.add_value("task_net_out_byte", container_labels, net_out)
            gauges.add_value("task_block_in_byte", container_labels, stats["BlockIO"]["in"])
            gauges.add_value("task_block_out_byte", container_labels, stats["BlockIO"]["out"])
            gauges.add_value("task_mem_usage_percent", container_labels, stats["MemPerc"])
        else:
            labels = {"name": pai_service_name}
            gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
            gauges.add_value("service_mem_usage_byte", labels, stats["MemUsage_Limit"]["usage"])
            gauges.add_value("service_mem_limit_byte", labels, stats["MemUsage_Limit"]["limit"])
            gauges.add_value("service_mem_usage_percent", labels, stats["MemPerc"])
            gauges.add_value("service_net_in_byte", labels, net_in)
            gauges.add_value("service_net_out_byte", labels, net_out)
            gauges.add_value("service_block_in_byte", labels, stats["BlockIO"]["in"])
            gauges.add_value("service_block_out_byte", labels, stats["BlockIO"]["out"])
Example #7
    def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                              gauges, dcgm_infos, infiniband_infos,
                              ipoib_infos):
        container_name = utils.walk_json_field_safe(stats, "name")
        pai_service_name = ContainerCollector.infer_service_name(
            container_name)

        inspect_info = docker_inspect.inspect(
            container_id, ContainerCollector.inspect_histogram,
            ContainerCollector.inspect_timeout)

        if inspect_info is None:
            logger.debug("ignore killed container %s", container_id)
            return

        pid = inspect_info.pid
        job_name = inspect_info.job_name

        logger.debug("%s has inspect result %s, service_name %s",
                     container_name, inspect_info, pai_service_name)

        if job_name is None and pai_service_name is None:
            logger.debug("%s is ignored", container_name)
            return  # other container, maybe kubelet or api-server

        # get network consumption. If the container uses the host network, we
        # treat node network consumption as the container's consumption;
        # otherwise we use data from docker state. As a result, the network
        # consumption reported for a service on the host network equals the
        # node's network consumption.
        is_host_network = inspect_info.is_host_network
        if is_host_network:
            net_in, net_out = network.get_network_consumption(
                self.network_interface)
        else:
            net_in, net_out = network.get_non_host_network_consumption(pid)

        if pai_service_name is None:
            gpu_ids, container_labels = ContainerCollector.parse_from_labels(
                inspect_info, gpu_infos)

            if gpu_infos:
                for id in gpu_ids:
                    if gpu_infos.get(id) is None:
                        continue

                    nvidia_gpu_status = gpu_infos[id]
                    uuid = nvidia_gpu_status.uuid
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = id
                    labels["uuid"] = uuid

                    gauges.add_value("task_gpu_percent", labels,
                                     nvidia_gpu_status.gpu_util)
                    gauges.add_value("task_gpu_mem_percent", labels,
                                     nvidia_gpu_status.gpu_mem_util)

            if dcgm_infos:
                for id in gpu_ids:
                    if dcgm_infos.get(id) is None:
                        continue
                    dcgm_metric = dcgm_infos[id]  # of type DCGMMetrics
                    uuid = dcgm_metric.uuid
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = id
                    labels["uuid"] = uuid
                    gauges.add_dcgm_metric(dcgm_metric, labels)

            if is_host_network:
                if infiniband_infos:
                    for infiniband_info in infiniband_infos:
                        labels = copy.deepcopy(container_labels)
                        labels.update(infiniband_info.labels)
                        gauges.add_value("task_infiniband_receive_bytes_total",
                                         labels, infiniband_info.receive_bytes)
                        gauges.add_value(
                            "task_infiniband_transmit_bytes_total", labels,
                            infiniband_info.transmit_bytes)
                if ipoib_infos:
                    for ipoib_info in ipoib_infos:
                        labels = copy.deepcopy(container_labels)
                        labels.update(ipoib_info.labels)
                        gauges.add_value("task_ipoib_receive_bytes_total",
                                         labels, ipoib_info.receive_bytes)
                        gauges.add_value("task_ipoib_transmit_bytes_total",
                                         labels, ipoib_info.transmit_bytes)

            gauges.add_value("task_cpu_percent", container_labels,
                             stats["CPUPerc"])
            gauges.add_value("task_mem_usage_byte", container_labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("task_mem_limit_byte", container_labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("task_net_in_byte", container_labels, net_in)
            gauges.add_value("task_net_out_byte", container_labels, net_out)
            gauges.add_value("task_block_in_byte", container_labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("task_block_out_byte", container_labels,
                             stats["BlockIO"]["out"])
            gauges.add_value("task_mem_usage_percent", container_labels,
                             stats["MemPerc"])
        else:
            labels = {"name": pai_service_name}
            gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
            gauges.add_value("service_mem_usage_byte", labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("service_mem_limit_byte", labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("service_mem_usage_percent", labels,
                             stats["MemPerc"])
            gauges.add_value("service_net_in_byte", labels, net_in)
            gauges.add_value("service_net_out_byte", labels, net_out)
            gauges.add_value("service_block_in_byte", labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("service_block_out_byte", labels,
                             stats["BlockIO"]["out"])