def parse_docker_inspect(inspect_output):
    obj = json.loads(inspect_output)

    labels = {}
    envs = {}

    obj_labels = utils.walk_json_field_safe(obj, 0, "Config", "Labels")
    if obj_labels is not None:
        for key in obj_labels:
            if key in targetLabel:
                labelKey = "container_label_{0}".format(key.replace(".", "_"))
                labelVal = obj_labels[key]
                labels[labelKey] = labelVal

    obj_env = utils.walk_json_field_safe(obj, 0, "Config", "Env")
    if obj_env:
        for env in obj_env:
            # split on the first "=" only, so values containing "=" stay intact
            envItem = env.split("=", 1)
            if envItem[0] in targetEnv:
                envKey = "container_env_{0}".format(envItem[0].replace(
                    ".", "_"))
                envVal = envItem[1]
                envs[envKey] = envVal

    pid = utils.walk_json_field_safe(obj, 0, "State", "Pid")

    return {"env": envs, "labels": labels, "pid": pid}
def parse_docker_inspect(inspect_output):
    obj = json.loads(inspect_output)

    m = {}

    obj_labels = utils.walk_json_field_safe(obj, 0, "Config", "Labels")
    if obj_labels is not None:
        for k, v in obj_labels.items():
            if k in keys:
                m[k] = v

    obj_env = utils.walk_json_field_safe(obj, 0, "Config", "Env")
    if obj_env:
        for env in obj_env:
            k, v = env.split("=", 1)
            if k in keys:
                m[k] = v
            # for kube-launcher tasks
            if k == "FC_TASK_INDEX":
                m["PAI_TASK_INDEX"] = v
            elif k == "NVIDIA_VISIBLE_DEVICES" and v != "all" and v != "void":
                m["GPU_ID"] = v
            if k == "FC_FRAMEWORK_ATTEMPT_INSTANCE_UID" or k == "APP_ID":
                m["JOB_INSTANCE_ID"] = v

    pid = utils.walk_json_field_safe(obj, 0, "State", "Pid")

    return InspectResult(
        m.get("PAI_USER_NAME") or m.get("DLWS_USER_NAME"),
        m.get("PAI_JOB_NAME") or m.get("DLWS_JOB_ID"),
        m.get("PAI_CURRENT_TASK_ROLE_NAME"),
        m.get("PAI_TASK_INDEX"),
        m.get("GPU_ID"),
        m.get("JOB_INSTANCE_ID"),
        pid)
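# NOTE: `InspectResult` is defined elsewhere in docker_inspect. The sketch below
# is a hypothetical namedtuple matching the positional arguments used in the
# variant above; the field names are guesses except `job_name` and `pid`, which
# the collectors further down access by attribute.
import collections

InspectResult = collections.namedtuple("InspectResult", [
    "username", "job_name", "role_name", "task_index", "gpu_ids",
    "job_instance_id", "pid"
])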
def parse_docker_inspect(inspect_output):
    obj = json.loads(inspect_output)

    labels = {}
    envs = {}

    obj_labels = utils.walk_json_field_safe(obj, 0, "Config", "Labels")
    if obj_labels is not None:
        for key in obj_labels:
            if key in target_label:
                label_key = "container_label_{0}".format(key)
                label_val = obj_labels[key]
                labels[label_key] = label_val

    obj_env = utils.walk_json_field_safe(obj, 0, "Config", "Env")
    if obj_env:
        for env in obj_env:
            k, v = env.split("=", 1)
            if k in target_env:
                key = "container_env_{0}".format(k)
                envs[key] = v
            # for kube-launcher tasks
            if k in target_label:
                label_key = "container_label_{0}".format(k)
                labels[label_key] = v
            if k == "FC_TASK_INDEX":
                envs["container_env_PAI_TASK_INDEX"] = v

    pid = utils.walk_json_field_safe(obj, 0, "State", "Pid")

    return {"env": envs, "labels": labels, "pid": pid}
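# A hedged usage sketch for the variant above. `target_label` and `target_env`
# are module-level allow-lists that are not shown in this section, and the
# inspect payload below is hypothetical, chosen only to exercise the parsing.
import json

target_label = {"PAI_USER_NAME"}   # hypothetical allow-list
target_env = {"PAI_TASK_INDEX"}    # hypothetical allow-list

sample_inspect = json.dumps([{
    "Config": {
        "Labels": {"PAI_USER_NAME": "alice"},
        "Env": ["PAI_TASK_INDEX=0", "FC_TASK_INDEX=0", "HOME=/root"],
    },
    "State": {"Pid": 12345},
}])

# Expected result: labels -> {"container_label_PAI_USER_NAME": "alice"},
# env -> {"container_env_PAI_TASK_INDEX": "0"}, pid -> 12345.
print(parse_docker_inspect(sample_inspect))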
def test_walk_json_field_safe(self):
    self.assertIsNone(utils.walk_json_field_safe(None, 1, "abc"))
    self.assertIsNone(utils.walk_json_field_safe([], 1, "abc"))
    self.assertIsNone(utils.walk_json_field_safe([{"abc"}], 1, "abc"))
    self.assertEqual(
        "345",
        utils.walk_json_field_safe([{
            "name": "123"
        }, {
            "name": "345"
        }], 1, "name"))
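# NOTE: `utils.walk_json_field_safe` is exercised by the test above but not
# defined in this section. The sketch below is only meant to be consistent with
# those assertions (follow list indices / dict keys into a nested structure and
# return None on any miss); the actual helper in `utils` may differ.
def walk_json_field_safe(obj, *fields):
    try:
        for field in fields:
            obj = obj[field]
        return obj
    except (KeyError, IndexError, TypeError):
        return None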
def parse_docker_inspect(inspect_output):
    obj = json.loads(inspect_output)

    m = {}

    obj_labels = utils.walk_json_field_safe(obj, 0, "Config", "Labels")
    if obj_labels is not None:
        for k, v in obj_labels.items():
            if k in keys:
                m[k] = v

    obj_env = utils.walk_json_field_safe(obj, 0, "Config", "Env")
    if obj_env:
        for env in obj_env:
            k, v = env.split("=", 1)
            if k in keys:
                m[k] = v
            # for kube-launcher tasks
            if k == "FC_TASK_INDEX":
                m["PAI_TASK_INDEX"] = v
            elif k == "NVIDIA_VISIBLE_DEVICES" and v != "all" and v != "void":
                m["GPU_ID"] = v

    pid = utils.walk_json_field_safe(obj, 0, "State", "Pid")
    logger.info("m is %s", m)

    return InspectResult(
        select_value_with_key(
            m, ["PAI_USER_NAME", "DLWS_USER_NAME", "DLTS_USER_NAME"]),
        select_value_with_key(
            m, ["PAI_JOB_NAME", "DLWS_JOB_ID", "DLTS_JOB_ID"]),
        select_value_with_key(
            m,
            ["PAI_CURRENT_TASK_ROLE_NAME", "DLWS_ROLE_NAME", "DLTS_ROLE_NAME"]),
        select_value_with_key(m, [
            "PAI_TASK_INDEX", "DLWS_ROLE_IDX", "DLTS_ROLE_IDX", "FC_TASK_INDEX"
        ]),
        select_value_with_key(m, ["POD_NAME", "PAI_JOB_NAME"]),
        m.get("GPU_ID"),
        pid,
        select_value_with_key(m, ["DLWS_USER_EMAIL", "DLTS_USER_EMAIL"]),
        select_value_with_key(m, ["DLWS_VC_NAME", "DLTS_VC_NAME"]),
        m.get("DLWS_HOST_NETWORK") == "enable"
        or m.get("DLTS_HOST_NETWORK") == "enable",
    )
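# NOTE: `select_value_with_key` is referenced above but not defined in this
# section. A minimal sketch consistent with how it is used (return the value of
# the first key that has a value in the map); the real helper may differ.
def select_value_with_key(m, keys):
    for key in keys:
        if m.get(key) is not None:
            return m[key]
    return None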
def collect_container_metrics(self, stats_obj, gpu_infos, all_conns):
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    gauges = ResourceGauges()

    for container_id, stats in stats_obj.items():
        try:
            self.process_one_container(container_id, stats, gpu_infos,
                                       all_conns, gauges)
        except Exception:
            logger.exception(
                "error when trying to process container %s with name %s",
                container_id, utils.walk_json_field_safe(stats, "name"))

    return gauges.as_array()
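# NOTE: `ResourceGauges` is defined elsewhere in the exporter. The sketch below
# only illustrates the interface assumed by `collect_container_metrics` above
# (`add_value` plus `as_array`) using prometheus_client; it is not the actual
# implementation, and it assumes numeric metric values and a consistent label
# set per metric name.
from prometheus_client.core import GaugeMetricFamily


class ResourceGauges(object):
    def __init__(self):
        self.gauges = {}  # metric name -> GaugeMetricFamily

    def add_value(self, metric_name, labels, value):
        if value is None:
            return
        label_names = sorted(labels.keys())
        if metric_name not in self.gauges:
            self.gauges[metric_name] = GaugeMetricFamily(metric_name,
                                                         metric_name,
                                                         labels=label_names)
        self.gauges[metric_name].add_metric(
            [str(labels[name]) for name in label_names], float(value))

    def as_array(self):
        return list(self.gauges.values())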
def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                          gauges):
    container_name = utils.walk_json_field_safe(stats, "name")
    pai_service_name = ContainerCollector.infer_service_name(container_name)

    inspect_info = docker_inspect.inspect(
        container_id, ContainerCollector.inspect_histogram,
        ContainerCollector.inspect_timeout, self.gpu_vendor)

    pid = inspect_info.pid
    job_name = inspect_info.job_name

    logger.debug("%s has inspect result %s, service_name %s", container_name,
                 inspect_info, pai_service_name)

    if job_name is None and pai_service_name is None:
        logger.debug("%s is ignored", container_name)
        return  # other container, maybe kubelet or api-server

    # Get network consumption ourselves: since all our services/jobs run in the
    # host network, the network statistics reported by docker are not specific
    # to this container.
    lsof_result = network.lsof(pid, ContainerCollector.lsof_histogram,
                               ContainerCollector.lsof_timeout)

    net_in, net_out = network.get_container_network_metrics(
        all_conns, lsof_result)
    if logger.isEnabledFor(logging.DEBUG):
        debug_info = utils.exec_cmd(
            "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)
        logger.debug("pid %s with cmd `%s` has lsof result %s, in %d, out %d",
                     pid, debug_info.strip(), lsof_result, net_in, net_out)

    if pai_service_name is None:
        gpu_ids, container_labels = ContainerCollector.parse_from_labels(
            inspect_info, gpu_infos)

        if gpu_infos:
            for id in gpu_ids:
                if gpu_infos.get(id) is None:
                    continue

                nvidia_gpu_status = gpu_infos[id]
                labels = copy.deepcopy(container_labels)
                labels["minor_number"] = id

                gauges.add_value("task_gpu_percent", labels,
                                 nvidia_gpu_status.gpu_util)
                gauges.add_value("task_gpu_mem_percent", labels,
                                 nvidia_gpu_status.gpu_mem_util)

        gauges.add_value("task_cpu_percent", container_labels,
                         stats["CPUPerc"])
        gauges.add_value("task_mem_usage_byte", container_labels,
                         stats["MemUsage_Limit"]["usage"])
        gauges.add_value("task_mem_limit_byte", container_labels,
                         stats["MemUsage_Limit"]["limit"])
        gauges.add_value("task_net_in_byte", container_labels, net_in)
        gauges.add_value("task_net_out_byte", container_labels, net_out)
        gauges.add_value("task_block_in_byte", container_labels,
                         stats["BlockIO"]["in"])
        gauges.add_value("task_block_out_byte", container_labels,
                         stats["BlockIO"]["out"])
        gauges.add_value("task_mem_usage_percent", container_labels,
                         stats["MemPerc"])
    else:
        labels = {"name": pai_service_name}

        gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
        gauges.add_value("service_mem_usage_byte", labels,
                         stats["MemUsage_Limit"]["usage"])
        gauges.add_value("service_mem_limit_byte", labels,
                         stats["MemUsage_Limit"]["limit"])
        gauges.add_value("service_mem_usage_percent", labels, stats["MemPerc"])
        gauges.add_value("service_net_in_byte", labels, net_in)
        gauges.add_value("service_net_out_byte", labels, net_out)
        gauges.add_value("service_block_in_byte", labels,
                         stats["BlockIO"]["in"])
        gauges.add_value("service_block_out_byte", labels,
                         stats["BlockIO"]["out"])
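# NOTE: `ContainerCollector.infer_service_name` is not included in this section.
# The sketch below mirrors the prefix matching in the older collect_job_metrics
# further down; it assumes `pai_services` is a module-level list of
# "k8s_<service>" container-name prefixes, and the real method may differ.
def infer_service_name(container_name):
    if container_name is None:
        return None
    for service_name in pai_services:
        if container_name.startswith(service_name):
            return service_name[len("k8s_"):]  # strip the "k8s_" prefix
    return None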
def collect_job_metrics(gpu_infos, all_conns, type1_zombies, type2_zombies):
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container_id, stats in stats_obj.items():
        pai_service_name = None

        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:]  # remove "k8s_" prefix
                break

        inspect_info = docker_inspect.inspect(container_id)
        pid = inspect_info["pid"] if inspect_info is not None else None

        inspect_labels = utils.walk_json_field_safe(inspect_info, "labels")

        if not inspect_labels and pai_service_name is None:
            continue  # other container, maybe kubelet or api-server

        # Get network consumption ourselves: since all our services/jobs run in
        # the host network, the network statistics reported by docker are not
        # specific to this container.
        lsof_result = network.lsof(pid)

        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)
        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.check_output(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)
            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d", pid,
                debug_info, lsof_result, net_in, net_out)

        if pai_service_name is None:
            gpuIds, otherLabels = parse_from_labels(inspect_info["labels"])
            otherLabels.update(inspect_info["env"])

            for id in gpuIds:
                if gpu_infos:
                    labels = copy.deepcopy(otherLabels)
                    labels["minor_number"] = id

                    result.append(
                        Metric("container_GPUPerc", labels,
                               gpu_infos[id]["gpuUtil"]))
                    result.append(
                        Metric("container_GPUMemPerc", labels,
                               gpu_infos[id]["gpuMemUtil"]))

            result.append(
                Metric("container_CPUPerc", otherLabels, stats["CPUPerc"]))
            result.append(
                Metric("container_MemUsage", otherLabels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("container_MemLimit", otherLabels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels, net_in))
            result.append(Metric("container_NetOut", otherLabels, net_out))
            result.append(
                Metric("container_BlockIn", otherLabels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("container_BlockOut", otherLabels,
                       stats["BlockIO"]["out"]))
            result.append(
                Metric("container_MemPerc", otherLabels, stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(
                Metric("service_cpu_percent", labels, stats["CPUPerc"]))
            result.append(
                Metric("service_mem_usage_byte", labels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("service_mem_limit_byte", labels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(
                Metric("service_mem_usage_percent", labels, stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, net_in))
            result.append(Metric("service_net_out_byte", labels, net_out))
            result.append(
                Metric("service_block_in_byte", labels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("service_block_out_byte", labels,
                       stats["BlockIO"]["out"]))

    result.extend(
        generate_zombie_count(stats_obj, type1_zombies, type2_zombies))

    return result
def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                          gauges, npu_infos, dcgm_infos):
    container_name = utils.walk_json_field_safe(stats, "name")
    pai_service_name = ContainerCollector.infer_service_name(container_name)

    inspect_info = docker_inspect.inspect(
        container_id, ContainerCollector.inspect_histogram,
        ContainerCollector.inspect_timeout)
    if inspect_info is None:
        return

    pid = inspect_info.pid
    job_name = inspect_info.job_name

    logger.debug("%s has inspect result %s, service_name %s", container_name,
                 inspect_info, pai_service_name)

    if job_name is None and pai_service_name is None:
        logger.debug("%s is ignored", container_name)
        return  # other container, maybe kubelet or api-server

    # Get network consumption: if the container uses the host network, report
    # the node's network consumption for it; otherwise use the data from docker.
    # As a result, every service using the host network reports the same
    # consumption as the node itself.
    is_host_network = inspect_info.is_host_network
    if is_host_network:
        net_in, net_out = network.get_network_consumption(
            self.network_interface)
    else:
        net_in, net_out = network.get_non_host_network_consumption(pid)

    if pai_service_name is None:
        gpu_ids, npu_ids, container_labels = ContainerCollector.parse_from_labels(
            inspect_info, gpu_infos)
        logger.info("start to collect metrics for jobId: %s",
                    container_labels["job_name"])
        if container_labels["username"] == "unknown":
            logger.warning("jobId: %s has unknown username, skipping",
                           container_labels["job_name"])
            return

        if gpu_infos:
            for id in gpu_ids:
                if gpu_infos.get(id) is None:
                    continue

                nvidia_gpu_status = gpu_infos[id]
                uuid = nvidia_gpu_status.uuid
                labels = copy.deepcopy(container_labels)
                labels["minor_number"] = id
                labels["device_type"] = inspect_info.gpu_type or "unknown"
                labels["uuid"] = uuid
                labels["device_str"] = "nvidia.com/gpu"

                gauges.add_value("task_device_percent", labels,
                                 nvidia_gpu_status.gpu_util)
                gauges.add_value("task_device_mem_percent", labels,
                                 nvidia_gpu_status.gpu_mem_util)

        if npu_infos:
            for id in npu_ids:
                if npu_infos.get(id) is None:
                    continue

                npu_status = npu_infos[id]
                labels = copy.deepcopy(container_labels)
                labels["minor_number"] = id
                labels["device_type"] = inspect_info.gpu_type or "unknown"
                labels["device_str"] = "npu.huawei.com/NPU"
                # each npu device should have one unique string
                labels["uuid"] = id
                if inspect_info.node_name:
                    labels["uuid"] = inspect_info.node_name + "_" + str(id)

                gauges.add_value("task_device_percent", labels,
                                 npu_status.npu_util)
                gauges.add_value("task_device_mem_percent", labels,
                                 npu_status.npu_mem_util)

        if dcgm_infos:
            for id in gpu_ids:
                if dcgm_infos.get(id) is None:
                    continue

                dcgm_metric = dcgm_infos[id]  # will be of type DCGMMetrics
                uuid = dcgm_metric.uuid
                labels = copy.deepcopy(container_labels)
                labels["minor_number"] = id
                labels["uuid"] = uuid

                gauges.add_dcgm_metric(dcgm_metric, labels)

        gauges.add_value("task_cpu_percent", container_labels,
                         stats["CPUPerc"])
        gauges.add_value("task_mem_usage_byte", container_labels,
                         stats["MemUsage_Limit"]["usage"])
        gauges.add_value("task_mem_limit_byte", container_labels,
                         stats["MemUsage_Limit"]["limit"])
        gauges.add_value("task_net_in_byte", container_labels, net_in)
        gauges.add_value("task_net_out_byte", container_labels, net_out)
        gauges.add_value("task_block_in_byte", container_labels,
                         stats["BlockIO"]["in"])
        gauges.add_value("task_block_out_byte", container_labels,
                         stats["BlockIO"]["out"])
        gauges.add_value("task_mem_usage_percent", container_labels,
                         stats["MemPerc"])
    else:
        labels = {"name": pai_service_name}

        gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
        gauges.add_value("service_mem_usage_byte", labels,
                         stats["MemUsage_Limit"]["usage"])
        gauges.add_value("service_mem_limit_byte", labels,
                         stats["MemUsage_Limit"]["limit"])
        gauges.add_value("service_mem_usage_percent", labels, stats["MemPerc"])
        gauges.add_value("service_net_in_byte", labels, net_in)
        gauges.add_value("service_net_out_byte", labels, net_out)
        gauges.add_value("service_block_in_byte", labels,
                         stats["BlockIO"]["in"])
        gauges.add_value("service_block_out_byte", labels,
                         stats["BlockIO"]["out"])
def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                          gauges, dcgm_infos, infiniband_infos, ipoib_infos):
    container_name = utils.walk_json_field_safe(stats, "name")
    pai_service_name = ContainerCollector.infer_service_name(container_name)

    inspect_info = docker_inspect.inspect(
        container_id, ContainerCollector.inspect_histogram,
        ContainerCollector.inspect_timeout)
    if inspect_info is None:
        logger.debug("ignore killed container %s", container_id)
        return

    pid = inspect_info.pid
    job_name = inspect_info.job_name

    logger.debug("%s has inspect result %s, service_name %s", container_name,
                 inspect_info, pai_service_name)

    if job_name is None and pai_service_name is None:
        logger.debug("%s is ignored", container_name)
        return  # other container, maybe kubelet or api-server

    # Get network consumption: if the container uses the host network, report
    # the node's network consumption for it; otherwise use the data from docker.
    # As a result, every service using the host network reports the same
    # consumption as the node itself.
    is_host_network = inspect_info.is_host_network
    if is_host_network:
        net_in, net_out = network.get_network_consumption(
            self.network_interface)
    else:
        net_in, net_out = network.get_non_host_network_consumption(pid)

    if pai_service_name is None:
        gpu_ids, container_labels = ContainerCollector.parse_from_labels(
            inspect_info, gpu_infos)

        if gpu_infos:
            for id in gpu_ids:
                if gpu_infos.get(id) is None:
                    continue

                nvidia_gpu_status = gpu_infos[id]
                uuid = nvidia_gpu_status.uuid
                labels = copy.deepcopy(container_labels)
                labels["minor_number"] = id
                labels["uuid"] = uuid

                gauges.add_value("task_gpu_percent", labels,
                                 nvidia_gpu_status.gpu_util)
                gauges.add_value("task_gpu_mem_percent", labels,
                                 nvidia_gpu_status.gpu_mem_util)

        if dcgm_infos:
            for id in gpu_ids:
                if dcgm_infos.get(id) is None:
                    continue

                dcgm_metric = dcgm_infos[id]  # will be of type DCGMMetrics
                uuid = dcgm_metric.uuid
                labels = copy.deepcopy(container_labels)
                labels["minor_number"] = id
                labels["uuid"] = uuid

                gauges.add_dcgm_metric(dcgm_metric, labels)

        if is_host_network:
            if infiniband_infos:
                for infiniband_info in infiniband_infos:
                    labels = copy.deepcopy(container_labels)
                    labels.update(infiniband_info.labels)
                    gauges.add_value("task_infiniband_receive_bytes_total",
                                     labels, infiniband_info.receive_bytes)
                    gauges.add_value("task_infiniband_transmit_bytes_total",
                                     labels, infiniband_info.transmit_bytes)
            if ipoib_infos:
                for ipoib_info in ipoib_infos:
                    labels = copy.deepcopy(container_labels)
                    labels.update(ipoib_info.labels)
                    gauges.add_value("task_ipoib_receive_bytes_total", labels,
                                     ipoib_info.receive_bytes)
                    gauges.add_value("task_ipoib_transmit_bytes_total", labels,
                                     ipoib_info.transmit_bytes)

        gauges.add_value("task_cpu_percent", container_labels,
                         stats["CPUPerc"])
        gauges.add_value("task_mem_usage_byte", container_labels,
                         stats["MemUsage_Limit"]["usage"])
        gauges.add_value("task_mem_limit_byte", container_labels,
                         stats["MemUsage_Limit"]["limit"])
        gauges.add_value("task_net_in_byte", container_labels, net_in)
        gauges.add_value("task_net_out_byte", container_labels, net_out)
        gauges.add_value("task_block_in_byte", container_labels,
                         stats["BlockIO"]["in"])
        gauges.add_value("task_block_out_byte", container_labels,
                         stats["BlockIO"]["out"])
        gauges.add_value("task_mem_usage_percent", container_labels,
                         stats["MemPerc"])
    else:
        labels = {"name": pai_service_name}

        gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
        gauges.add_value("service_mem_usage_byte", labels,
                         stats["MemUsage_Limit"]["usage"])
        gauges.add_value("service_mem_limit_byte", labels,
                         stats["MemUsage_Limit"]["limit"])
        gauges.add_value("service_mem_usage_percent", labels, stats["MemPerc"])
        gauges.add_value("service_net_in_byte", labels, net_in)
        gauges.add_value("service_net_out_byte", labels, net_out)
        gauges.add_value("service_block_in_byte", labels,
                         stats["BlockIO"]["in"])
        gauges.add_value("service_block_out_byte", labels,
                         stats["BlockIO"]["out"])