Example #1
def collect_job_metrics(gpuInfos):
    stats = docker_stats.stats()
    if stats is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container in stats:
        inspectInfo = docker_inspect.inspect(container)
        if inspectInfo is None or not inspectInfo["labels"]:
            continue

        gpuIds, otherLabels = parse_from_labels(inspectInfo["labels"])
        otherLabels.update(inspectInfo["env"])

        for id in gpuIds:
            if gpuInfos:
                logger.info(gpuInfos)
                labels = copy.deepcopy(otherLabels)
                labels["minor_number"] = id

                result.append(Metric("container_GPUPerc", labels, gpuInfos[id]["gpuUtil"]))
                result.append(Metric("container_GPUMemPerc", labels, gpuInfos[id]["gpuMemUtil"]))

        result.append(Metric("container_CPUPerc", otherLabels, stats[container]["CPUPerc"]))
        result.append(Metric("container_MemUsage", otherLabels, stats[container]["MemUsage_Limit"]["usage"]))
        result.append(Metric("container_MemLimit", otherLabels, stats[container]["MemUsage_Limit"]["limit"]))
        result.append(Metric("container_NetIn", otherLabels, stats[container]["NetIO"]["in"]))
        result.append(Metric("container_NetOut", otherLabels, stats[container]["NetIO"]["out"]))
        result.append(Metric("container_BlockIn", otherLabels, stats[container]["BlockIO"]["in"]))
        result.append(Metric("container_BlockOut", otherLabels, stats[container]["BlockIO"]["out"]))
        result.append(Metric("container_MemPerc", otherLabels, stats[container]["MemPerc"]))

    return result
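
The examples on this page assume a Metric value type and a parse_from_labels helper from the surrounding project. A minimal sketch of what these might look like (the field names and the GPU label key are assumptions, not taken from the actual source):

# Minimal sketch of the assumed helpers; the "container_label_GPU_ID" key and
# the field names are guesses based on how the examples use them.
import collections

Metric = collections.namedtuple("Metric", ["name", "labels", "value"])

def parse_from_labels(labels):
    """Split docker labels into GPU minor numbers and the remaining labels."""
    gpu_ids = []
    other_labels = {}
    for key, value in labels.items():
        if key == "container_label_GPU_ID":  # assumed label key
            gpu_ids = [v for v in value.split(",") if v]
        else:
            other_labels[key] = value
    return gpu_ids, other_labels
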
Example #2
    def collect_impl(self):
        all_conns = network.iftop(self.network_interface,
                                  ContainerCollector.iftop_histogram,
                                  ContainerCollector.iftop_timeout)

        stats_obj = docker_stats.stats(ContainerCollector.stats_histogram,
                                       ContainerCollector.stats_timeout)

        now = datetime.datetime.now()
        gpu_infos = self.gpu_info_ref.get(now)
        self.stats_info_ref.set(stats_obj, now)
        dcgm_infos = self.dcgm_info_ref.get(now)
        infiniband_infos = self.infiniband_info_ref.get(now)
        ipoib_infos = self.ipoib_info_ref.get(now)

        logger.debug("all_conns is %s", all_conns)
        logger.debug("gpu_info is %s", gpu_infos)
        logger.debug("stats_obj is %s", stats_obj)
        logger.debug("dcgm_infos is %s", dcgm_infos)
        logger.debug("infiniband_infos is %s", infiniband_infos)
        logger.debug("ipoib_infos is %s", ipoib_infos)

        return self.collect_container_metrics(stats_obj, gpu_infos, all_conns,
                                              dcgm_infos, infiniband_infos,
                                              ipoib_infos)
Example #3
def collect_job_metrics(gpuInfos):
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returned None")
        return None

    result = []
    for container_id, stats in stats_obj.items():
        pai_service_name = None

        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:] # remove "k8s_" prefix
                break

        if pai_service_name is None:
            inspectInfo = docker_inspect.inspect(container_id)
            if inspectInfo is None or not inspectInfo["labels"]:
                continue

            gpuIds, otherLabels = parse_from_labels(inspectInfo["labels"])
            otherLabels.update(inspectInfo["env"])

            for id in gpuIds:
                if gpuInfos:
                    logger.info(gpuInfos)
                    labels = copy.deepcopy(otherLabels)
                    labels["minor_number"] = id

                    result.append(Metric("container_GPUPerc", labels, gpuInfos[id]["gpuUtil"]))
                    result.append(Metric("container_GPUMemPerc", labels, gpuInfos[id]["gpuMemUtil"]))

            result.append(Metric("container_CPUPerc", otherLabels, stats["CPUPerc"]))
            result.append(Metric("container_MemUsage", otherLabels, stats["MemUsage_Limit"]["usage"]))
            result.append(Metric("container_MemLimit", otherLabels, stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels, stats["NetIO"]["in"]))
            result.append(Metric("container_NetOut", otherLabels, stats["NetIO"]["out"]))
            result.append(Metric("container_BlockIn", otherLabels, stats["BlockIO"]["in"]))
            result.append(Metric("container_BlockOut", otherLabels, stats["BlockIO"]["out"]))
            result.append(Metric("container_MemPerc", otherLabels, stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(Metric("service_cpu_percent", labels, stats["CPUPerc"]))
            result.append(Metric("service_mem_usage_byte", labels, stats["MemUsage_Limit"]["usage"]))
            result.append(Metric("service_mem_limit_byte", labels, stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("service_mem_usage_percent", labels, stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, stats["NetIO"]["in"]))
            result.append(Metric("service_net_out_byte", labels, stats["NetIO"]["out"]))
            result.append(Metric("service_block_in_byte", labels, stats["BlockIO"]["in"]))
            result.append(Metric("service_block_out_byte", labels, stats["BlockIO"]["out"]))

    return result
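
The TODO in Example #3 flags the nested prefix scan as O(n^2) over containers and services. One possible way to make the lookup O(1) per container (a sketch under the assumption that docker names Kubernetes containers "k8s_<container>_<pod>_<namespace>_<uid>_<attempt>", so the service can be recovered by splitting on "_"; note this is stricter than the original prefix match):

# Sketch of an O(1)-per-container lookup; assumes the k8s_ docker naming
# convention above and that service names contain no underscores.
PAI_SERVICE_NAMES = {name[4:] for name in pai_services}  # strip "k8s_" once

def match_pai_service(container_name):
    """Return the pai service name, or None if the container is not a pai service."""
    parts = container_name.split("_")
    if len(parts) < 2 or parts[0] != "k8s":
        return None
    return parts[1] if parts[1] in PAI_SERVICE_NAMES else None
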
Example #4
def genJobMetrics(logDir, gpuMetrics):
    stats = docker_stats.stats()
    if stats is None:
        return
    outputFile = open(logDir + "/job_exporter.prom", "w")
    for container in stats:
        inspectInfo = docker_inspect.inspect(container)
        if not inspectInfo["labels"]:
            continue
        gpuIds, labelStr = parseFromLabels(inspectInfo["labels"])
        envStr = parseFromEnv(inspectInfo["env"])
        labelStr = labelStr + envStr
        for id in gpuIds:
            print("gpu id")
            print(id)
            if gpuMetrics:
                print(gpuMetrics)
                containerGpuUtilStr = 'container_GPUPerc{{{0}minor_number=\"{1}\"}} {2}\n'.format(
                    labelStr, id, gpuMetrics[id]["gpuUtil"])
                containerMemUtilStr = 'container_GPUMemPerc{{{0}minor_number=\"{1}\"}} {2}\n'.format(
                    labelStr, id, gpuMetrics[id]["gpuMemUtil"])
                outputFile.write(containerGpuUtilStr)
                outputFile.write(containerMemUtilStr)

        containerCPUPerc = 'container_CPUPerc{{{0}}} {1}\n'.format(
            labelStr, stats[container]["CPUPerc"])
        containerMemUsage = 'container_MemUsage{{{0}}} {1}\n'.format(
            labelStr, stats[container]["MemUsage_Limit"]["usage"])
        containerMemLimit = 'container_MemLimit{{{0}}} {1}\n'.format(
            labelStr, stats[container]["MemUsage_Limit"]["limit"])
        containerNetIn = 'container_NetIn{{{0}}} {1}\n'.format(
            labelStr, stats[container]["NetIO"]["in"])
        containerNetOut = 'container_NetOut{{{0}}} {1}\n'.format(
            labelStr, stats[container]["NetIO"]["out"])
        containerBlockIn = 'container_BlockIn{{{0}}} {1}\n'.format(
            labelStr, stats[container]["BlockIO"]["in"])
        containerBlockOut = 'container_BlockOut{{{0}}} {1}\n'.format(
            labelStr, stats[container]["BlockIO"]["out"])
        containerMemPerc = 'container_MemPerc{{{0}}} {1}\n'.format(
            labelStr, stats[container]["MemPerc"])
        outputFile.write(containerCPUPerc)
        outputFile.write(containerMemUsage)
        outputFile.write(containerMemLimit)
        outputFile.write(containerNetIn)
        outputFile.write(containerNetOut)
        outputFile.write(containerBlockIn)
        outputFile.write(containerBlockOut)
        outputFile.write(containerMemPerc)
    outputFile.close()
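
Example #4 writes Prometheus text-format lines directly, so labelStr has to be a (possibly empty) run of key="value", pairs ending in a comma for minor_number="..." to be appended right after it. A rough sketch of what the assumed parseFromLabels and parseFromEnv helpers might look like (the GPU label key is a guess, not the project's actual code):

# Rough sketch of the assumed label-formatting helpers. labelStr must end with
# a comma (or be empty) so the format strings above can append more labels.
def parseFromLabels(labels):
    gpuIds = [g for g in labels.get("container_label_GPU_ID", "").split(",") if g]  # assumed key
    labelStr = ""
    for key, value in labels.items():
        if key != "container_label_GPU_ID":
            labelStr += '{0}="{1}",'.format(key, value)
    return gpuIds, labelStr

def parseFromEnv(env):
    return "".join('{0}="{1}",'.format(key, value) for key, value in env.items())
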
Example #5
    def collect_impl(self):
        all_conns = network.iftop(self.network_interface,
                ContainerCollector.iftop_histogram,
                ContainerCollector.iftop_timeout)

        # Set gpu_infos to None so that if nvidia-smi hangs until the next
        # collection, we get None instead of stale data.
        gpu_infos = self.gpu_info_ref.get_and_set(None)

        stats_obj = docker_stats.stats(ContainerCollector.stats_histogram,
                ContainerCollector.stats_timeout)
        self.stats_info_ref.get_and_set(stats_obj)

        logger.debug("all_conns is %s, gpu_info is %s, stats_obj is %s",
                all_conns, gpu_infos, stats_obj)

        return self.collect_container_metrics(stats_obj, gpu_infos, all_conns)
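
Examples #2 and #5 hand data between collector threads through the *_info_ref objects. A minimal sketch of such a reference, assuming it only needs an atomic swap (the project's real class may also track timestamps for staleness, as the get(now)/set(obj, now) calls in Example #2 suggest):

import threading

class AtomicRef(object):
    """Minimal sketch of a thread-safe reference with an atomic get-and-set."""
    def __init__(self, value=None):
        self._lock = threading.Lock()
        self._value = value

    def get_and_set(self, new_value):
        # Atomically swap in new_value and return the previous value.
        with self._lock:
            old, self._value = self._value, new_value
            return old
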
Example #6
def collect_job_metrics(gpu_infos, all_conns, type1_zombies, type2_zombies):
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container_id, stats in stats_obj.items():
        pai_service_name = None

        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:]  # remove "k8s_" prefix
                break

        inspect_info = docker_inspect.inspect(container_id)
        pid = inspect_info["pid"] if inspect_info is not None else None
        inspect_labels = utils.walk_json_field_safe(inspect_info, "labels")

        if not inspect_labels and pai_service_name is None:
            continue  # other container, maybe kubelet or api-server

        # Get network consumption ourselves: since all our services/jobs run in the
        # host network, the network statistics reported by docker are not specific
        # to a single container.
        lsof_result = network.lsof(pid)
        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)
        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.check_output(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)

            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d", pid,
                debug_info, lsof_result, net_in, net_out)

        if pai_service_name is None:
            gpuIds, otherLabels = parse_from_labels(inspect_info["labels"])
            otherLabels.update(inspect_info["env"])

            for id in gpuIds:
                if gpu_infos:
                    labels = copy.deepcopy(otherLabels)
                    labels["minor_number"] = id

                    result.append(
                        Metric("container_GPUPerc", labels,
                               gpu_infos[id]["gpuUtil"]))
                    result.append(
                        Metric("container_GPUMemPerc", labels,
                               gpu_infos[id]["gpuMemUtil"]))

            result.append(
                Metric("container_CPUPerc", otherLabels, stats["CPUPerc"]))
            result.append(
                Metric("container_MemUsage", otherLabels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("container_MemLimit", otherLabels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels, net_in))
            result.append(Metric("container_NetOut", otherLabels, net_out))
            result.append(
                Metric("container_BlockIn", otherLabels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("container_BlockOut", otherLabels,
                       stats["BlockIO"]["out"]))
            result.append(
                Metric("container_MemPerc", otherLabels, stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(
                Metric("service_cpu_percent", labels, stats["CPUPerc"]))
            result.append(
                Metric("service_mem_usage_byte", labels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("service_mem_limit_byte", labels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(
                Metric("service_mem_usage_percent", labels, stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, net_in))
            result.append(Metric("service_net_out_byte", labels, net_out))
            result.append(
                Metric("service_block_in_byte", labels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("service_block_out_byte", labels,
                       stats["BlockIO"]["out"]))

    result.extend(
        generate_zombie_count(stats_obj, type1_zombies, type2_zombies))

    return result
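
Because all services and jobs run in the host network, Example #6 attributes traffic to a container by combining the machine-wide iftop connection list with the sockets held open by the container's pid. A rough sketch of that idea with assumed data shapes (the real network.lsof and get_container_network_metrics may well differ):

# Rough sketch of per-container network attribution under host networking.
# Assumed shapes: all_conns is a list of dicts like
# {"local_port": 2230, "in": in_bytes, "out": out_bytes}, and container_ports
# is the set of local ports the container's pid holds open (from lsof).
def get_container_network_metrics(all_conns, container_ports):
    net_in = net_out = 0
    for conn in all_conns:
        # Count a connection toward this container if it owns the local port.
        if conn["local_port"] in container_ports:
            net_in += conn["in"]
            net_out += conn["out"]
    return net_in, net_out
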