class OverLimit:
    """Collect pod CPU/memory usage for one application, count over-limit
    events and append each sample to a metrics log."""
    k = Kubectl()
    wait_time = 30  # seconds between samples
    metric_item_list = ["cpu_value", "memory_value"]
    limit_item_list = ["pod_cpu_limits", "pod_memory_limits"]
    request_item_list = ["pod_cpu_requests", "pod_memory_requests"]
    app_list = {}
    app_name = ""
    namespace = ""
    cpu_limit = 0
    mem_limit = 0
    oc = OC()
    app_type = ""
    prometheus = Prometheus()

    def __init__(self):
        # target application, configurable through environment variables
        app_namespace = os.environ.get("NAMESPACE") or "nginx"
        app_type = os.environ.get("RESOURCE_TYPE") or "deployment"
        resource = os.environ.get("RESOURCE") or "nginx"
        self.namespace = app_namespace
        self.app_name = resource
        self.app_type = app_type

    def find_deploymentconfig_by_namespace(self, app_name):
        # list deployments/deploymentconfigs whose name contains app_name
        deployment_name_list = []
        output = ""
        if self.app_type == "deployment":
            output = self.oc.get_deployment(self.namespace)
        if self.app_type == "deploymentconfig":
            output = self.oc.get_deploymentconfig(self.namespace)
        for line in output.split("\n"):
            if line.find(app_name) != -1:
                deployment_name = line.split()[0]
                deployment_name_list.append(deployment_name)
        return deployment_name_list

    def find_pod_by_namespace(self, app_name):
        # list pods matching app_name, skipping build pods
        pod_name_list = []
        output = self.oc.get_pods(self.namespace)
        for line in output.split("\n"):
            if line.find(app_name) != -1:
                pod_name = line.split()[0]
                if pod_name.find("build") != -1:
                    continue
                pod_name_list.append(pod_name)
        return pod_name_list

    def get_deploymentconfig(self):
        # print ("---get deployment info---")
        self.app_list = {}
        deployment_name_list = self.find_deploymentconfig_by_namespace(
            self.app_name)
        for deployment in deployment_name_list:
            self.app_list[deployment] = {}
        # print self.app_list

    def get_pod_info(self):
        # print ("---get pod info---")
        pod_name_list = self.find_pod_by_namespace(self.app_name)
        for pod_name in pod_name_list:
            for deployment in self.app_list.keys():
                if pod_name.find(deployment) != -1:
                    self.app_list[deployment][pod_name] = {}
        # print self.app_list

    def get_metrics(self):
        # print ("---get metrics---")
        self.kubectl = Kubectl()
        for metric_item in self.metric_item_list:
            for deployment in self.app_list.keys():
                for pod_name in self.app_list[deployment]:
                    self.app_list[deployment][pod_name][metric_item] = 0
        for deployment in self.app_list.keys():
            for pod_name in self.app_list[deployment].keys():
                output = self.kubectl.top_pod(pod_name, self.namespace)
                for line in output.split("\n"):
                    if line.find(pod_name) != -1:
                        # by kubectl top
                        cpu = int(line.split()[-2].strip("m"))  # mCore
                        memory = int(line.split()[-1].strip("Mi"))  # MiB
                        self.app_list[deployment][pod_name]["cpu_value"] = cpu
                        self.app_list[deployment][pod_name][
                            "memory_value"] = memory
        # print self.app_list

    def get_pod_limit(self, pod_name):
        # print ("---get pod limit---")
        # data collect interval needs to stay under 30s, so return the
        # cpu/memory limits from the settings directly
        # (d_cpu_limit / d_memory_limit are expected to be module-level defaults)
        cpu_limit = d_cpu_limit
        memory_limit = d_memory_limit
        return cpu_limit, memory_limit
        # unreachable: kept as reference for reading limits from the pod spec
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if output:
            try:
                output = json.loads(output)
                cpu_limit1 = output.get("spec", {}).get(
                    "containers", [])[0].get("resources").get("limits").get("cpu")
                if cpu_limit1 and cpu_limit1.find("m") != -1:
                    cpu_limit = float(cpu_limit1.split("m")[0])
                else:
                    cpu_limit = float(cpu_limit1) * 1000
                memory_limit1 = output.get("spec", {}).get(
                    "containers", [])[0].get("resources").get("limits").get("memory")
                if memory_limit1 and memory_limit1.find("M") != -1:
                    memory_limit = float(memory_limit1.split("M")[0])
                elif memory_limit1 and memory_limit1.find("G") != -1:
                    memory_limit = float(memory_limit1.split("G")[0]) * 1000
            except Exception as e:
                print "failed to get limits: %s" % str(e)
        return cpu_limit, memory_limit

    def get_limits(self):
        # store per-pod cpu/memory limits in app_list
        for metric_item in self.limit_item_list:
            for deployment in self.app_list.keys():
                for pod_name in self.app_list[deployment].keys():
                    cpu_limit, memory_limit = self.get_pod_limit(pod_name)
                    if metric_item == "pod_cpu_limits":
                        self.app_list[deployment][pod_name][
                            metric_item] = cpu_limit
                    else:
                        self.app_list[deployment][pod_name][
                            metric_item] = memory_limit

    def get_pod_reason(self, pod_name):
        # return the lastState.terminated block of the first container, if any
        reason_list = []
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if output:
            output = json.loads(output)
            if output.get("status").get("containerStatuses")[0].get(
                    "lastState"):
                terminated = output.get("status").get(
                    "containerStatuses")[0].get("lastState").get("terminated")
                reason_list.append(terminated)
        return reason_list

    def get_status(self):
        # record status, restart count and last termination reason per pod
        output = self.oc.get_pods(self.namespace)
        for deployment in self.app_list.keys():
            for pod_name in self.app_list[deployment].keys():
                for line in output.split("\n"):
                    if line.find(self.app_name) != -1:
                        pod = line.split()[0]
                        if pod == pod_name:
                            reason_list = self.get_pod_reason(pod_name)
                            status = line.split()[2]
                            restart = int(line.split()[3])
                            self.app_list[deployment][pod_name][
                                "status"] = status
                            self.app_list[deployment][pod_name][
                                "restart"] = restart
                            self.app_list[deployment][pod_name][
                                "reason"] = reason_list

    def get_node_status(self):
        # print "get node status"
        node_info = {}
        output = self.oc.get_nodes()
        for line in output.split("\n"):
            if line.find("NAME") == -1 and line:
                node_name = line.split()[0]
                status = line.split()[1]
                node_info[node_name] = {}
                node_info[node_name]["status"] = status
                usage_output = self.k.top_node(node_name)
                for line in usage_output.split("\n"):
                    if line.find(node_name) != -1:
                        cpu = int(line.split()[1].split("m")[0])
                        memory = int(line.split()[3].split("Mi")[0])
                        node_info[node_name]["cpu"] = cpu
                        node_info[node_name]["memory"] = memory
        # print node_info
        return node_info

    def get_http_requests(self):
        # 2xx responses per minute for the nginx route, averaged over 2 minutes
        #query = "%s{namespace=\"%s\"}" % (ingress_http_requests_name, ingress_namespace)
        query = "sum(idelta(haproxy_server_http_responses_total{exported_namespace=\"nginx\",route=\"nginx-service\",code=\"2xx\"}[2m]))"
        output = self.prometheus.query_value(query)
        return float(output) / 2.0

    def calculate_overlimit(self, algo, time_count):
        # collect a sample every wait_time seconds for time_count minutes
        cpu_count = 0
        memory_count = 0
        count = 0
        total_restart = 0
        total_terminated = 0
        data_count = int(time_count * 60 / self.wait_time)
        print "--- %s collect data and write to logs for %d minutes ---" % (
            algo.split("_")[0].upper(), time_count)
        start_time = time.time()
        for i in range(data_count):
            self.get_deploymentconfig()
            self.get_pod_info()
            self.get_limits()
            self.get_metrics()
            # self.get_status()
            print "--- %s start to collect data at %d/%d interval(in 30 sec), start: %s, current: %s ---" % (
                algo.split("_")[0], i, data_count, start_time, time.time())
            for deployment in self.app_list.keys():
                cpu_limit = 0
                memory_limit = 0
                total_cpu = 0
                total_memory = 0
                total_cpu_limit = 0
                total_memory_limit = 0
                # per-pod usage vs. limits
                for pod in self.app_list[deployment].keys():
                    if self.app_list[deployment][pod].get("pod_cpu_limits"):
                        cpu_limit = self.app_list[deployment][pod][
                            "pod_cpu_limits"]
                        memory_limit = self.app_list[deployment][pod][
                            "pod_memory_limits"]
                    cpu = self.app_list[deployment][pod]["cpu_value"]
                    memory = self.app_list[deployment][pod]["memory_value"]
                    total_cpu += cpu
                    total_memory += memory
                    total_cpu_limit += cpu_limit
                    total_memory_limit += memory_limit
                    if cpu >= cpu_limit and cpu_limit != 0:
                        cpu_count += 1
                    if memory >= memory_limit and memory_limit != 0:
                        memory_count += 1
                    restart = self.app_list[deployment][pod].get("restart", 0)
                    total_restart += restart
                    reason = self.app_list[deployment][pod].get("reason", [])
                    total_terminated += len(reason)
                num_replica = len(self.app_list[deployment].keys())
                # http requests
                http_requests = self.get_http_requests()
                print self.app_name, "total_cpu=", total_cpu, "m"
                print self.app_name, "total_memory=", total_memory, "Mi"
                print self.app_name, "current replica=%d" % num_replica
                print self.app_name, "overflow=", cpu_count, "times"
                print self.app_name, "oom=", memory_count, "times"
                print self.app_name, "restart=", total_restart, "times"
                print self.app_name, "terminated=", total_terminated, "times"
                print self.app_name, "http_requests=%s" % http_requests
                print "\n"
                total_status = 0
                total_node_cpu = 0
                total_node_memory = 0
                # skip collecting node info (takes too long)
                #node_info = self.get_node_status()
                #for node in node_info.keys():
                #    if node_info[node].get("status").find("NotReady") != -1:
                #        total_status += 1
                #    total_node_cpu += node_info[node]["cpu"]
                #    total_node_memory += node_info[node]["memory"]
                algo_name = "%s-%s" % (self.app_name, algo)
                # note: "restart" is the last pod's restart count
                data = [
                    algo_name, total_cpu, total_cpu_limit, total_memory,
                    total_memory_limit, cpu_count, memory_count, num_replica,
                    restart, total_status, total_node_cpu, total_node_memory,
                    http_requests
                ]
                self.write_metric(data)
            # print "wait %d seconds" % self.wait_time
            # keep the sampling aligned to 30-second intervals
            interval = 30
            for j in range(interval):
                end_time = time.time()
                if end_time - start_time >= interval:
                    start_time = start_time + interval
                    break
                time.sleep(1)

    def write_metric(self, data):
        # print "write metrics"
        # append one space-separated, timestamped sample line to ./metrics/<algo_name>
        timestamp = str(int(time.time()))
        data.append(timestamp)
        try:
            pod_name = data[0]
            fn = "./metrics/%s" % pod_name
            with open(fn, "a") as f:
                line = " ".join([str(elem) for elem in data])
                f.write("%s\n" % str(line))
        except Exception as e:
            print "failed to write metrics:%s" % str(e)
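

# A minimal, self-contained sketch of the "kubectl top pod" parsing convention
# used by get_metrics() above: the last two columns of each output line are
# CPU in millicores and memory in MiB. The helper name and the sample line are
# illustrative only, not part of the original module.
def _parse_top_pod_line(line):
    fields = line.split()
    cpu_mcore = int(fields[-2].strip("m"))    # e.g. "250m"  -> 250
    memory_mib = int(fields[-1].strip("Mi"))  # e.g. "512Mi" -> 512
    return cpu_mcore, memory_mib

# print _parse_top_pod_line("nginx-1-abcde   250m   512Mi")  # -> (250, 512)
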
class WriteLog:
    """Same collection loop as OverLimit, but without the Prometheus
    HTTP-request query; the target must be set on the instance by the caller."""
    k = Kubectl()
    wait_time = 30  # seconds between samples
    metric_item_list = ["cpu_value", "memory_value"]
    limit_item_list = ["pod_cpu_limits", "pod_memory_limits"]
    request_item_list = ["pod_cpu_requests", "pod_memory_requests"]
    app_list = {}
    app_name = ""
    namespace = ""
    cpu_limit = 0
    mem_limit = 0
    oc = OC()
    app_type = ""

    def __init__(self):
        pass

    def find_deploymentconfig_by_namespace(self, app_name):
        # list deployments/deploymentconfigs/statefulsets whose name
        # contains app_name
        deployment_name_list = []
        output = ""
        if self.app_type == "deployment":
            output = self.oc.get_deployment(self.namespace)
        elif self.app_type == "deploymentconfig":
            output = self.oc.get_deploymentconfig(self.namespace)
        elif self.app_type == "statefulset":
            output = self.oc.get_statefulset(self.namespace)
        for line in output.split("\n"):
            if line.find(app_name) != -1:
                deployment_name = line.split()[0]
                deployment_name_list.append(deployment_name)
        return deployment_name_list

    def find_pod_by_namespace(self, app_name):
        # list pods matching app_name, skipping build pods
        pod_name_list = []
        output = self.oc.get_pods(self.namespace)
        for line in output.split("\n"):
            if line.find(app_name) != -1:
                pod_name = line.split()[0]
                if pod_name.find("build") != -1:
                    continue
                pod_name_list.append(pod_name)
        return pod_name_list

    def get_deploymentconfig(self):
        # print ("---get deployment info---")
        self.app_list = {}
        deployment_name_list = self.find_deploymentconfig_by_namespace(
            self.app_name)
        for deployment in deployment_name_list:
            self.app_list[deployment] = {}
        # print self.app_list

    def get_pod_info(self):
        # print ("---get pod info---")
        pod_name_list = self.find_pod_by_namespace(self.app_name)
        for pod_name in pod_name_list:
            for deployment in self.app_list.keys():
                if pod_name.find(deployment) != -1:
                    self.app_list[deployment][pod_name] = {}
        # print self.app_list

    def get_metrics(self):
        # print ("---get metrics---")
        self.kubectl = Kubectl()
        for metric_item in self.metric_item_list:
            for deployment in self.app_list.keys():
                for pod_name in self.app_list[deployment]:
                    self.app_list[deployment][pod_name][metric_item] = 0
        for deployment in self.app_list.keys():
            for pod_name in self.app_list[deployment].keys():
                output = self.kubectl.top_pod(pod_name, self.namespace)
                for line in output.split("\n"):
                    if line.find(pod_name) != -1:
                        # by kubectl top
                        cpu = int(line.split()[-2].strip("m"))  # mCore
                        memory = int(line.split()[-1].strip("Mi"))  # MiB
                        self.app_list[deployment][pod_name]["cpu_value"] = cpu
                        self.app_list[deployment][pod_name][
                            "memory_value"] = memory
        # print self.app_list

    def get_pod_limit(self, pod_name):
        # print ("---get pod limit---")
        # read the cpu/memory limits of the first container from the pod spec
        cpu_limit = 0
        memory_limit = 0
        cpu_limit_mcore = "0m"
        memory_limit_mb = "0Mi"
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if output:
            try:
                output = json.loads(output)
                if output.get("spec", {}).get("containers",
                                              [])[0].get("resources"):
                    cpu_limit_mcore = output.get("spec", {}).get(
                        "containers",
                        [])[0].get("resources").get("limits").get("cpu", "0m")
                if cpu_limit_mcore and cpu_limit_mcore.find("m") != -1:
                    cpu_limit = float(cpu_limit_mcore.split("m")[0])
                else:
                    cpu_limit = float(cpu_limit_mcore) * 1000
                if output.get("spec", {}).get("containers",
                                              [])[0].get("resources"):
                    memory_limit_mb = output.get("spec", {}).get(
                        "containers", [])[0].get("resources").get("limits").get(
                            "memory", "0Mi")
                if memory_limit_mb and memory_limit_mb.find("M") != -1:
                    memory_limit = float(memory_limit_mb.split("M")[0])
                elif memory_limit_mb and memory_limit_mb.find("G") != -1:
                    memory_limit = float(memory_limit_mb.split("G")[0]) * 1000
            except Exception as e:
                print "failed to get limits: %s" % str(e)
        return cpu_limit, memory_limit

    def get_limits(self):
        # store per-pod cpu/memory limits in app_list
        for metric_item in self.limit_item_list:
            for deployment in self.app_list.keys():
                for pod_name in self.app_list[deployment].keys():
                    cpu_limit, memory_limit = self.get_pod_limit(pod_name)
                    if metric_item == "pod_cpu_limits":
                        self.app_list[deployment][pod_name][
                            metric_item] = cpu_limit
                    else:
                        self.app_list[deployment][pod_name][
                            metric_item] = memory_limit

    def get_pod_reason(self, pod_name):
        # return the reason of the last container termination, if any
        reason_list = []
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if output:
            output = json.loads(output)
            if output.get("status").get("containerStatuses")[0].get(
                    "lastState"):
                terminated = output.get("status").get("containerStatuses")[
                    0].get("lastState").get("terminated").get("reason")
                reason_list.append(terminated)
        return reason_list

    def get_status(self, is_reason=True):
        # record status, restart count and optionally the termination reason
        output = self.oc.get_pods(self.namespace)
        for deployment in self.app_list.keys():
            for pod_name in self.app_list[deployment].keys():
                for line in output.split("\n"):
                    if line.find(self.app_name) != -1:
                        pod = line.split()[0]
                        if pod == pod_name:
                            status = line.split()[2]
                            restart = int(line.split()[3])
                            self.app_list[deployment][pod_name][
                                "status"] = status
                            self.app_list[deployment][pod_name][
                                "restart"] = restart
                            if is_reason:
                                reason_list = self.get_pod_reason(pod_name)
                                self.app_list[deployment][pod_name][
                                    "reason"] = reason_list

    def get_node_status(self):
        # print "get node status"
        node_info = {}
        output = self.oc.get_nodes()
        for line in output.split("\n"):
            if line.find("NAME") == -1 and line:
                node_name = line.split()[0]
                status = line.split()[1]
                node_info[node_name] = {}
                node_info[node_name]["status"] = status
                usage_output = self.k.top_node(node_name)
                for line in usage_output.split("\n"):
                    if line.find(node_name) != -1:
                        cpu = int(line.split()[1].split("m")[0])
                        memory = int(line.split()[3].split("Mi")[0])
                        node_info[node_name]["cpu"] = cpu
                        node_info[node_name]["memory"] = memory
        # print node_info
        return node_info

    def calculate_overlimit(self, algo, time_count):
        # collect a sample every wait_time seconds for time_count minutes
        cpu_count = 0
        memory_count = 0
        count = 0
        total_restart = 0
        total_terminated = 0
        data_count = int(time_count * 60 / self.wait_time)
        print "--- %s collect data and write to logs for %d minutes ---" % (
            algo.split("_")[0].upper(), time_count)
        for i in range(data_count):
            start_time = time.time()
            self.get_deploymentconfig()
            self.get_pod_info()
            self.get_limits()
            self.get_metrics()
            self.get_status()
            print "--- %s start to collect data at %d/%d interval(in 30 sec) ---" % (
                algo.split("_")[0], i, data_count)
            for deployment in self.app_list.keys():
                cpu_limit = 0
                memory_limit = 0
                total_cpu = 0
                total_memory = 0
                total_cpu_limit = 0
                total_memory_limit = 0
                # per-pod usage vs. limits
                for pod in self.app_list[deployment].keys():
                    if self.app_list[deployment][pod].get("pod_cpu_limits"):
                        cpu_limit = self.app_list[deployment][pod][
                            "pod_cpu_limits"]
                        memory_limit = self.app_list[deployment][pod][
                            "pod_memory_limits"]
                    cpu = self.app_list[deployment][pod]["cpu_value"]
                    memory = self.app_list[deployment][pod]["memory_value"]
                    total_cpu += cpu
                    total_memory += memory
                    total_cpu_limit += cpu_limit
                    total_memory_limit += memory_limit
                    if cpu >= cpu_limit and cpu_limit != 0:
                        cpu_count += 1
                    if memory >= memory_limit and memory_limit != 0:
                        memory_count += 1
                    restart = self.app_list[deployment][pod].get("restart", 0)
                    total_restart += restart
                    reason = self.app_list[deployment][pod].get("reason", [])
                    total_terminated += len(reason)
                num_replica = len(self.app_list[deployment].keys())
                print self.app_name, "total_cpu=", total_cpu, "m"
                print self.app_name, "total_memory=", total_memory, "Mi"
                print self.app_name, "current replica=%d" % num_replica
                print self.app_name, "overflow=", cpu_count, "times"
                print self.app_name, "oom=", memory_count, "times"
                print self.app_name, "restart=", total_restart, "times"
                print self.app_name, "terminated=", total_terminated, "times"
                print "\n"
                total_status = 0
                algo_name = "%s-%s" % (self.app_name, algo)
                # note: "restart" is the last pod's restart count
                data = [
                    algo_name, total_cpu, total_cpu_limit, total_memory,
                    total_memory_limit, cpu_count, memory_count, num_replica,
                    restart, total_status
                ]
                self.write_metric(data)
            # print "wait %d seconds" % self.wait_time
            # keep the sampling aligned to 30-second intervals
            interval = 30
            for j in range(interval):
                end_time = time.time()
                if end_time - start_time >= interval:
                    start_time = start_time + interval
                    break
                time.sleep(5)

    def write_metric(self, data):
        # print "write metrics"
        # append one space-separated, timestamped sample line to ./metrics/<algo_name>
        timestamp = str(int(time.time()))
        data.append(timestamp)
        try:
            pod_name = data[0]
            fn = "./metrics/%s" % pod_name
            with open(fn, "a") as f:
                line = " ".join([str(elem) for elem in data])
                f.write("%s\n" % str(line))
        except Exception as e:
            print "failed to write metrics:%s" % str(e)