def collect_job_metrics(gpuInfos):
    stats = docker_stats.stats()
    if stats is None:
        logger.warning("docker stats returns None")
        return None

    result = []

    for container in stats:
        inspectInfo = docker_inspect.inspect(container)
        if inspectInfo is None or not inspectInfo["labels"]:
            continue

        gpuIds, otherLabels = parse_from_labels(inspectInfo["labels"])
        otherLabels.update(inspectInfo["env"])

        for id in gpuIds:
            if gpuInfos:
                logger.info(gpuInfos)
                labels = copy.deepcopy(otherLabels)
                labels["minor_number"] = id

                result.append(Metric("container_GPUPerc", labels,
                                     gpuInfos[id]["gpuUtil"]))
                result.append(Metric("container_GPUMemPerc", labels,
                                     gpuInfos[id]["gpuMemUtil"]))

        result.append(Metric("container_CPUPerc", otherLabels,
                             stats[container]["CPUPerc"]))
        result.append(Metric("container_MemUsage", otherLabels,
                             stats[container]["MemUsage_Limit"]["usage"]))
        result.append(Metric("container_MemLimit", otherLabels,
                             stats[container]["MemUsage_Limit"]["limit"]))
        result.append(Metric("container_NetIn", otherLabels,
                             stats[container]["NetIO"]["in"]))
        result.append(Metric("container_NetOut", otherLabels,
                             stats[container]["NetIO"]["out"]))
        result.append(Metric("container_BlockIn", otherLabels,
                             stats[container]["BlockIO"]["in"]))
        result.append(Metric("container_BlockOut", otherLabels,
                             stats[container]["BlockIO"]["out"]))
        result.append(Metric("container_MemPerc", otherLabels,
                             stats[container]["MemPerc"]))

    return result
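
# NOTE: the Metric type built above is not shown in these snippets. A minimal
# sketch of what the functions appear to assume (a metric name, a dict of
# Prometheus labels, and a numeric sample value) might look like the
# following; the real definition may differ.
class Metric(object):
    def __init__(self, name, labels, value):
        self.name = name      # e.g. "container_CPUPerc"
        self.labels = labels  # dict mapping label name -> label value
        self.value = value    # numeric sample value

    def __repr__(self):
        return "{0}{1} {2}".format(self.name, self.labels, self.value)
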
def collect_impl(self):
    all_conns = network.iftop(self.network_interface,
                              ContainerCollector.iftop_histogram,
                              ContainerCollector.iftop_timeout)

    stats_obj = docker_stats.stats(ContainerCollector.stats_histogram,
                                   ContainerCollector.stats_timeout)

    now = datetime.datetime.now()
    gpu_infos = self.gpu_info_ref.get(now)
    self.stats_info_ref.set(stats_obj, now)

    dcgm_infos = self.dcgm_info_ref.get(now)
    infiniband_infos = self.infiniband_info_ref.get(now)
    ipoib_infos = self.ipoib_info_ref.get(now)

    logger.debug("all_conns is %s", all_conns)
    logger.debug("gpu_info is %s", gpu_infos)
    logger.debug("stats_obj is %s", stats_obj)
    logger.debug("dcgm_infos is %s", dcgm_infos)
    logger.debug("infiniband_infos is %s", infiniband_infos)
    logger.debug("ipoib_infos is %s", ipoib_infos)

    return self.collect_container_metrics(stats_obj, gpu_infos, all_conns,
                                           dcgm_infos, infiniband_infos,
                                           ipoib_infos)
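
# NOTE: the *_ref holders above are not defined in these snippets. Passing a
# timestamp to get()/set() suggests a holder that records when a value was
# produced so that stale readings can be discarded. A sketch under that
# assumption (the name TimestampedRef and the max_age policy are guesses):
import datetime
import threading

class TimestampedRef(object):
    def __init__(self, max_age=datetime.timedelta(seconds=60)):
        self._lock = threading.Lock()
        self._value = None
        self._time = None
        self._max_age = max_age

    def set(self, value, now):
        with self._lock:
            self._value, self._time = value, now

    def get(self, now):
        """Return the stored value, or None if unset or older than max_age."""
        with self._lock:
            if self._time is None or now - self._time > self._max_age:
                return None
            return self._value
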
def collect_job_metrics(gpuInfos):
    # bind the full stats dict to stats_obj so the per-container loop variable
    # below does not shadow it
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    result = []

    for container_id, stats in stats_obj.items():
        pai_service_name = None

        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:]  # remove "k8s_" prefix
                break

        if pai_service_name is None:
            inspectInfo = docker_inspect.inspect(container_id)
            if inspectInfo is None or not inspectInfo["labels"]:
                continue

            gpuIds, otherLabels = parse_from_labels(inspectInfo["labels"])
            otherLabels.update(inspectInfo["env"])

            for id in gpuIds:
                if gpuInfos:
                    logger.info(gpuInfos)
                    labels = copy.deepcopy(otherLabels)
                    labels["minor_number"] = id

                    result.append(Metric("container_GPUPerc", labels,
                                         gpuInfos[id]["gpuUtil"]))
                    result.append(Metric("container_GPUMemPerc", labels,
                                         gpuInfos[id]["gpuMemUtil"]))

            result.append(Metric("container_CPUPerc", otherLabels,
                                 stats["CPUPerc"]))
            result.append(Metric("container_MemUsage", otherLabels,
                                 stats["MemUsage_Limit"]["usage"]))
            result.append(Metric("container_MemLimit", otherLabels,
                                 stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels,
                                 stats["NetIO"]["in"]))
            result.append(Metric("container_NetOut", otherLabels,
                                 stats["NetIO"]["out"]))
            result.append(Metric("container_BlockIn", otherLabels,
                                 stats["BlockIO"]["in"]))
            result.append(Metric("container_BlockOut", otherLabels,
                                 stats["BlockIO"]["out"]))
            result.append(Metric("container_MemPerc", otherLabels,
                                 stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}

            result.append(Metric("service_cpu_percent", labels,
                                 stats["CPUPerc"]))
            result.append(Metric("service_mem_usage_byte", labels,
                                 stats["MemUsage_Limit"]["usage"]))
            result.append(Metric("service_mem_limit_byte", labels,
                                 stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("service_mem_usage_percent", labels,
                                 stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels,
                                 stats["NetIO"]["in"]))
            result.append(Metric("service_net_out_byte", labels,
                                 stats["NetIO"]["out"]))
            result.append(Metric("service_block_in_byte", labels,
                                 stats["BlockIO"]["in"]))
            result.append(Metric("service_block_out_byte", labels,
                                 stats["BlockIO"]["out"]))

    return result
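
# NOTE: parse_from_labels is not shown in these snippets. A plausible sketch,
# assuming the scheduler attaches a comma-separated GPU id label (the key
# name "container_label_GPU_ID" is an assumption) and that every other docker
# label is passed through as a Prometheus label:
def parse_from_labels(labels):
    gpu_ids = []
    other_labels = {}
    for key, val in labels.items():
        if key == "container_label_GPU_ID":  # assumed key name
            gpu_ids = [v for v in val.replace('"', "").split(",") if v]
        else:
            other_labels[key] = val
    return gpu_ids, other_labels
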
def genJobMetrics(logDir, gpuMetrics):
    stats = docker_stats.stats()

    # use a context manager so the .prom file is flushed and closed even if a
    # container lookup raises
    with open(logDir + "/job_exporter.prom", "w") as outputFile:
        for container in stats:
            inspectInfo = docker_inspect.inspect(container)
            if not inspectInfo["labels"]:
                continue

            gpuIds, labelStr = parseFromLabels(inspectInfo["labels"])
            envStr = parseFromEnv(inspectInfo["env"])
            labelStr = labelStr + envStr

            for id in gpuIds:
                print("gpu id")
                print(id)
                if gpuMetrics:
                    print(gpuMetrics)
                    containerGpuUtilStr = 'container_GPUPerc{{{0}minor_number="{1}"}} {2}\n'.format(
                        labelStr, id, gpuMetrics[id]["gpuUtil"])
                    containerMemUtilStr = 'container_GPUMemPerc{{{0}minor_number="{1}"}} {2}\n'.format(
                        labelStr, id, gpuMetrics[id]["gpuMemUtil"])
                    outputFile.write(containerGpuUtilStr)
                    outputFile.write(containerMemUtilStr)

            containerCPUPerc = 'container_CPUPerc{{{0}}} {1}\n'.format(
                labelStr, stats[container]["CPUPerc"])
            containerMemUsage = 'container_MemUsage{{{0}}} {1}\n'.format(
                labelStr, stats[container]["MemUsage_Limit"]["usage"])
            containerMemLimit = 'container_MemLimit{{{0}}} {1}\n'.format(
                labelStr, stats[container]["MemUsage_Limit"]["limit"])
            containerNetIn = 'container_NetIn{{{0}}} {1}\n'.format(
                labelStr, stats[container]["NetIO"]["in"])
            containerNetOut = 'container_NetOut{{{0}}} {1}\n'.format(
                labelStr, stats[container]["NetIO"]["out"])
            containerBlockIn = 'container_BlockIn{{{0}}} {1}\n'.format(
                labelStr, stats[container]["BlockIO"]["in"])
            containerBlockOut = 'container_BlockOut{{{0}}} {1}\n'.format(
                labelStr, stats[container]["BlockIO"]["out"])
            containerMemPerc = 'container_MemPerc{{{0}}} {1}\n'.format(
                labelStr, stats[container]["MemPerc"])

            outputFile.write(containerCPUPerc)
            outputFile.write(containerMemUsage)
            outputFile.write(containerMemLimit)
            outputFile.write(containerNetIn)
            outputFile.write(containerNetOut)
            outputFile.write(containerBlockIn)
            outputFile.write(containerBlockOut)
            outputFile.write(containerMemPerc)
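
# Illustrative only: the format strings above emit Prometheus text-exposition
# samples. The label values here are hypothetical; given
# labelStr == 'username="alice",' and a GPU utilization of 85, the first
# write would produce:
#
#   container_GPUPerc{username="alice",minor_number="0"} 85
#
line = 'container_GPUPerc{{{0}minor_number="{1}"}} {2}\n'.format(
    'username="alice",', 0, 85)
assert line == 'container_GPUPerc{username="alice",minor_number="0"} 85\n'
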
def collect_impl(self):
    all_conns = network.iftop(self.network_interface,
                              ContainerCollector.iftop_histogram,
                              ContainerCollector.iftop_timeout)

    # swap in None so that, if nvidia-smi hangs until the next collection,
    # get_and_set will return None instead of a stale reading
    gpu_infos = self.gpu_info_ref.get_and_set(None)

    stats_obj = docker_stats.stats(ContainerCollector.stats_histogram,
                                   ContainerCollector.stats_timeout)
    self.stats_info_ref.get_and_set(stats_obj)

    logger.debug("all_conns is %s, gpu_info is %s, stats_obj is %s",
                 all_conns, gpu_infos, stats_obj)

    return self.collect_container_metrics(stats_obj, gpu_infos, all_conns)
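
# NOTE: get_and_set is the swap-style counterpart to the timestamped holder
# sketched earlier: it atomically replaces the stored value and hands back the
# previous one. A minimal sketch, assuming nothing more than that contract:
import threading

class AtomicRef(object):
    def __init__(self, value=None):
        self._lock = threading.Lock()
        self._value = value

    def get_and_set(self, new_value):
        """Atomically store new_value and return the value it replaced."""
        with self._lock:
            old, self._value = self._value, new_value
            return old
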
def collect_job_metrics(gpu_infos, all_conns, type1_zombies, type2_zombies):
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    result = []

    for container_id, stats in stats_obj.items():
        pai_service_name = None

        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:]  # remove "k8s_" prefix
                break

        inspect_info = docker_inspect.inspect(container_id)
        pid = inspect_info["pid"] if inspect_info is not None else None
        inspect_labels = utils.walk_json_field_safe(inspect_info, "labels")

        if not inspect_labels and pai_service_name is None:
            continue  # other container, maybe kubelet or api-server

        # get network consumption ourselves: since all our services/jobs run
        # in the host network, the network statistics reported by docker are
        # not specific to any one container
        lsof_result = network.lsof(pid)
        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)

        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.check_output(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)
            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d",
                pid, debug_info, lsof_result, net_in, net_out)

        if pai_service_name is None:
            gpuIds, otherLabels = parse_from_labels(inspect_info["labels"])
            otherLabels.update(inspect_info["env"])

            for id in gpuIds:
                if gpu_infos:
                    labels = copy.deepcopy(otherLabels)
                    labels["minor_number"] = id

                    result.append(Metric("container_GPUPerc", labels,
                                         gpu_infos[id]["gpuUtil"]))
                    result.append(Metric("container_GPUMemPerc", labels,
                                         gpu_infos[id]["gpuMemUtil"]))

            result.append(Metric("container_CPUPerc", otherLabels,
                                 stats["CPUPerc"]))
            result.append(Metric("container_MemUsage", otherLabels,
                                 stats["MemUsage_Limit"]["usage"]))
            result.append(Metric("container_MemLimit", otherLabels,
                                 stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels, net_in))
            result.append(Metric("container_NetOut", otherLabels, net_out))
            result.append(Metric("container_BlockIn", otherLabels,
                                 stats["BlockIO"]["in"]))
            result.append(Metric("container_BlockOut", otherLabels,
                                 stats["BlockIO"]["out"]))
            result.append(Metric("container_MemPerc", otherLabels,
                                 stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}

            result.append(Metric("service_cpu_percent", labels,
                                 stats["CPUPerc"]))
            result.append(Metric("service_mem_usage_byte", labels,
                                 stats["MemUsage_Limit"]["usage"]))
            result.append(Metric("service_mem_limit_byte", labels,
                                 stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("service_mem_usage_percent", labels,
                                 stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, net_in))
            result.append(Metric("service_net_out_byte", labels, net_out))
            result.append(Metric("service_block_in_byte", labels,
                                 stats["BlockIO"]["in"]))
            result.append(Metric("service_block_out_byte", labels,
                                 stats["BlockIO"]["out"]))

    result.extend(
        generate_zombie_count(stats_obj, type1_zombies, type2_zombies))

    return result
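
# Usage sketch (hypothetical): rendering the returned Metric list into
# Prometheus text exposition format, assuming the minimal Metric class
# sketched earlier. The real exporter may serialize differently.
def render_metrics(metrics):
    lines = []
    for m in metrics:
        label_str = ",".join('{0}="{1}"'.format(k, v)
                             for k, v in sorted(m.labels.items()))
        lines.append("{0}{{{1}}} {2}".format(m.name, label_str, m.value))
    return "\n".join(lines) + "\n"
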