class WriteLog:
    """Collect usage, limits and status of one application's pods and log them.

    Callers set ``namespace``, ``app_name`` and ``app_type`` on the instance
    before calling the ``get_*`` methods.  Results are stored in ``app_list``
    as ``{controller_name: {pod_name: {item: value}}}`` where the items
    written by this class are ``cpu_value``, ``memory_value``,
    ``pod_cpu_limits``, ``pod_memory_limits``, ``status``, ``restart`` and
    ``reason``.
    """

    k = Kubectl()
    # Seconds between two samples taken by calculate_overlimit().
    wait_time = 30
    metric_item_list = ["cpu_value", "memory_value"]
    limit_item_list = ["pod_cpu_limits", "pod_memory_limits"]
    request_item_list = ["pod_cpu_requests", "pod_memory_requests"]
    app_list = {}
    app_name = ""
    namespace = ""
    cpu_limit = 0
    mem_limit = 0
    oc = OC()
    app_type = ""

    def __init__(self):
        pass

    def find_deploymentconfig_by_namespace(self, app_name):
        """Return the names of controllers in ``self.namespace`` matching ``app_name``.

        The controller kind queried depends on ``self.app_type``
        ("deployment", "deploymentconfig" or "statefulset"); any other value
        yields an empty list.  A line matches when it contains ``app_name``
        anywhere, and its first column is taken as the name.
        """
        output = ""
        if self.app_type == "deployment":
            output = self.oc.get_deployment(self.namespace)
        elif self.app_type == "deploymentconfig":
            output = self.oc.get_deploymentconfig(self.namespace)
        elif self.app_type == "statefulset":
            output = self.oc.get_statefulset(self.namespace)
        return [
            line.split()[0]
            for line in output.split("\n")
            if line.find(app_name) != -1
        ]

    def find_pod_by_namespace(self, app_name):
        """Return pod names in ``self.namespace`` whose listing line contains ``app_name``.

        Pods whose name contains "build" are skipped.
        """
        pod_name_list = []
        output = self.oc.get_pods(self.namespace)
        for line in output.split("\n"):
            if line.find(app_name) == -1:
                continue
            pod_name = line.split()[0]
            if pod_name.find("build") != -1:
                continue
            pod_name_list.append(pod_name)
        return pod_name_list

    def get_deploymentconfig(self):
        """Reset ``app_list`` to one empty entry per matching controller."""
        self.app_list = {}
        for deployment in self.find_deploymentconfig_by_namespace(self.app_name):
            self.app_list[deployment] = {}

    def get_pod_info(self):
        """Attach each matching pod to every controller whose name it contains."""
        for pod_name in self.find_pod_by_namespace(self.app_name):
            for deployment in self.app_list:
                if pod_name.find(deployment) != -1:
                    self.app_list[deployment][pod_name] = {}

    def get_metrics(self):
        """Fill ``cpu_value`` / ``memory_value`` for every pod from ``top_pod``.

        Both values default to 0 and are overwritten when a line of the
        ``top_pod`` output contains the pod name.
        """
        self.kubectl = Kubectl()
        for deployment in self.app_list:
            for pod_name in self.app_list[deployment]:
                for metric_item in self.metric_item_list:
                    self.app_list[deployment][pod_name][metric_item] = 0
        for deployment in self.app_list:
            for pod_name in self.app_list[deployment]:
                output = self.kubectl.top_pod(pod_name, self.namespace)
                for line in output.split("\n"):
                    if line.find(pod_name) == -1:
                        continue
                    fields = line.split()
                    # Last two columns of the top output: CPU (mCore) and memory (Mi).
                    cpu = int(fields[-2].strip("m"))
                    memory = int(fields[-1].strip("Mi"))
                    self.app_list[deployment][pod_name]["cpu_value"] = cpu
                    self.app_list[deployment][pod_name]["memory_value"] = memory

    def get_pod_limit(self, pod_name):
        """Return ``(cpu_limit, memory_limit)`` of the pod's first container.

        CPU is returned in millicores and memory in the unit used by the
        "M..." suffix; both are 0 when the pod JSON is empty, when no limit
        is declared, or when parsing fails (the failure is printed).
        """
        cpu_limit = 0
        memory_limit = 0
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if not output:
            return cpu_limit, memory_limit
        try:
            pod = json.loads(output)
            container = pod.get("spec", {}).get("containers", [])[0]
            resources = container.get("resources") or {}
            # A container may declare requests without limits; treat that as
            # "no limit" instead of going through the exception path.
            limits = resources.get("limits") or {}

            cpu_text = limits.get("cpu", "0m")
            if cpu_text and cpu_text.find("m") != -1:
                cpu_limit = float(cpu_text.split("m")[0])
            else:
                # Plain core count -> millicores.
                cpu_limit = float(cpu_text) * 1000

            memory_text = limits.get("memory", "0Mi")
            if memory_text and memory_text.find("M") != -1:
                memory_limit = float(memory_text.split("M")[0])
            elif memory_text and memory_text.find("G") != -1:
                # NOTE(review): a "Gi" quantity is 1024 Mi, not 1000; the
                # original factor is kept so logged values stay comparable.
                memory_limit = float(memory_text.split("G")[0]) * 1000
        except Exception as e:
            print("failed to get limits: %s" % str(e))
        return cpu_limit, memory_limit

    def get_limits(self):
        """Store the CPU and memory limit of every pod in ``app_list``."""
        for deployment in self.app_list:
            for pod_name in self.app_list[deployment]:
                # One lookup per pod; both limit items come from the same call.
                cpu_limit, memory_limit = self.get_pod_limit(pod_name)
                for limit_item in self.limit_item_list:
                    if limit_item == "pod_cpu_limits":
                        value = cpu_limit
                    else:
                        value = memory_limit
                    self.app_list[deployment][pod_name][limit_item] = value

    def get_pod_reason(self, pod_name):
        """Return a list with the last termination reason of the first container.

        The list is empty when the pod JSON is empty, the pod has no
        container statuses yet, or the previous state was not "terminated".
        """
        reason_list = []
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if not output:
            return reason_list
        pod = json.loads(output)
        statuses = (pod.get("status") or {}).get("containerStatuses") or []
        if not statuses:
            return reason_list
        last_state = statuses[0].get("lastState") or {}
        # lastState may hold "running"/"waiting" instead of "terminated".
        terminated = last_state.get("terminated")
        if terminated:
            reason_list.append(terminated.get("reason"))
        return reason_list

    def get_status(self, is_reason=True):
        """Record ``status`` and ``restart`` (and optionally ``reason``) per pod.

        Values are read from columns 3 and 4 of the pod listing line whose
        first column equals the pod name.
        """
        lines = self.oc.get_pods(self.namespace).split("\n")
        for deployment in self.app_list:
            for pod_name in self.app_list[deployment]:
                pod_entry = self.app_list[deployment][pod_name]
                for line in lines:
                    if line.find(self.app_name) == -1:
                        continue
                    fields = line.split()
                    if fields[0] != pod_name:
                        continue
                    pod_entry["status"] = fields[2]
                    pod_entry["restart"] = int(fields[3])
                    if is_reason:
                        pod_entry["reason"] = self.get_pod_reason(pod_name)

    def get_node_status(self):
        """Return ``{node: {"status", "cpu", "memory"}}`` for every listed node.

        ``cpu`` / ``memory`` are only present when the ``top_node`` output
        contains a line with the node name.
        """
        node_info = {}
        output = self.oc.get_nodes()
        for line in output.split("\n"):
            if not line or line.find("NAME") != -1:
                continue
            fields = line.split()
            node_name = fields[0]
            node_info[node_name] = {"status": fields[1]}
            usage_output = self.k.top_node(node_name)
            for usage_line in usage_output.split("\n"):
                if usage_line.find(node_name) == -1:
                    continue
                usage = usage_line.split()
                node_info[node_name]["cpu"] = int(usage[1].split("m")[0])
                node_info[node_name]["memory"] = int(usage[3].split("Mi")[0])
        return node_info

    def calculate_overlimit(self, algo, time_count):
        """Sample the application every ``wait_time`` seconds and log the totals.

        Runs for ``time_count`` minutes.  For each controller and sample one
        record is appended through ``write_metric``.  The over-limit, restart
        and terminated counters accumulate over the whole run.

        :param algo: algorithm label; the part before the first "_" is used
            in progress messages and the full value in the metrics file name.
        :param time_count: duration of the collection in minutes.
        """
        cpu_count = 0
        memory_count = 0
        total_restart = 0
        total_terminated = 0
        data_count = int(time_count * 60 / self.wait_time)
        algo_label = algo.split("_")[0]
        print("--- %s collect data and write to logs for %d minutes ---" % (
            algo_label.upper(), time_count))
        for i in range(data_count):
            start_time = time.time()
            self.get_deploymentconfig()
            self.get_pod_info()
            self.get_limits()
            self.get_metrics()
            self.get_status()
            # The total is the number of samples; the original referenced an
            # undefined name (data_interval) here.
            print("--- %s start to collect data at %d/%d interval(in 30 sec) ---" % (
                algo_label, i, data_count))
            for deployment in self.app_list:
                pods = self.app_list[deployment]
                total_cpu = 0
                total_memory = 0
                total_cpu_limit = 0
                total_memory_limit = 0
                # Stays 0 when the controller has no pods, so the record
                # below can always be built.
                restart = 0
                for pod in pods:
                    pod_entry = pods[pod]
                    # Read limits per pod so a pod without limits does not
                    # inherit the previous pod's values.
                    cpu_limit = pod_entry.get("pod_cpu_limits", 0)
                    memory_limit = pod_entry.get("pod_memory_limits", 0)
                    cpu = pod_entry["cpu_value"]
                    memory = pod_entry["memory_value"]
                    total_cpu += cpu
                    total_memory += memory
                    total_cpu_limit += cpu_limit
                    total_memory_limit += memory_limit
                    # A limit of 0 means "no limit" and never counts.
                    if cpu_limit != 0 and cpu >= cpu_limit:
                        cpu_count += 1
                    if memory_limit != 0 and memory >= memory_limit:
                        memory_count += 1
                    restart = pod_entry.get("restart", 0)
                    total_restart += restart
                    total_terminated += len(pod_entry.get("reason", []))
                num_replica = len(pods)
                print("%s total_cpu= %s m" % (self.app_name, total_cpu))
                print("%s total_memory= %s Mi" % (self.app_name, total_memory))
                print("%s current replica=%d" % (self.app_name, num_replica))
                print("%s overflow= %s times" % (self.app_name, cpu_count))
                print("%s oom= %s times" % (self.app_name, memory_count))
                print("%s restart= %s times" % (self.app_name, total_restart))
                print("%s terminated= %s times" % (self.app_name, total_terminated))
                print("\n")
                total_status = 0
                algo_name = "%s-%s" % (self.app_name, algo)
                data = [
                    algo_name, total_cpu, total_cpu_limit, total_memory,
                    total_memory_limit, cpu_count, memory_count, num_replica,
                    restart, total_status
                ]
                self.write_metric(data)
            # Pad the iteration so samples are wait_time seconds apart,
            # polling in 5 second steps.
            while time.time() - start_time < self.wait_time:
                time.sleep(5)

    def write_metric(self, data):
        """Append ``data`` plus a timestamp as one line to ``./metrics/<data[0]>``.

        The timestamp (integer seconds) is appended to ``data`` in place.
        Failures are printed and otherwise ignored.
        """
        timestamp = str(int(time.time()))
        data.append(timestamp)
        try:
            pod_name = data[0]
            fn = "./metrics/%s" % pod_name
            with open(fn, "a") as f:
                line = " ".join([str(elem) for elem in data])
                f.write("%s\n" % str(line))
        except Exception as e:
            print("failed to write metrics:%s" % str(e))
def check_system_pods_ready(conf, platform):
    """Return the result of ``check_pods_ready`` for the ``kube-system`` namespace.

    A ``Kubectl`` built from ``conf`` and ``platform`` is used for the check.
    """
    client = Kubectl(conf, platform)
    system_namespace = "kube-system"
    return check_pods_ready(client, namespace=system_namespace)
def kubectl(conf):
    """Return a ``Kubectl`` constructed from ``conf``."""
    client = Kubectl(conf)
    return client
class Zookeeper(Client):
    """Metrics logger for the "my-cluster-zookeeper" statefulset in "myproject".

    Collection is delegated to the class-level ``WriteLog`` instance ``w``,
    which is configured with this application's namespace, name and type.
    """

    oc = OC()
    k = Kubectl()
    w = WriteLog()

    def __init__(self):
        super(Zookeeper, self).__init__()
        self.namespace = "myproject"
        self.app_name = "my-cluster-zookeeper"
        self.app_type = "statefulset"
        self.w.namespace = self.namespace
        self.w.app_name = self.app_name
        self.w.app_type = self.app_type

    def wait_time(self, value):
        """Sleep for ``value`` seconds."""
        time.sleep(value)

    def _pods(self):
        """Return the collected ``{pod_name: values}`` dict, or {} if none was found."""
        return self.w.app_list.get(self.app_name, {})

    def calculate_pod_info(self):
        """Sum usage, limits, restarts and status counts over all pods.

        :return: eight space-separated values followed by a trailing space:
            cpu, cpu limit, memory, memory limit, restarts, number of
            Running / CrashLoopBackOff / OOMKilled pods.
        """
        app_cpu_value = 0
        app_memory_value = 0
        app_cpu_limit = 0
        app_memory_limit = 0
        app_restart = 0
        app_status_running = 0
        app_status_crashloopbackoff = 0
        app_status_oomkilled = 0
        for info in self._pods().values():
            # Items that were not collected for a pod contribute nothing.
            app_cpu_value += info.get("cpu_value", 0)
            app_memory_value += info.get("memory_value", 0)
            app_cpu_limit += info.get("pod_cpu_limits", 0)
            app_memory_limit += info.get("pod_memory_limits", 0)
            app_restart += info.get("restart", 0)
            status = info.get("status")
            if status == "Running":
                app_status_running += 1
            if status == "CrashLoopBackOff":
                app_status_crashloopbackoff += 1
            if status == "OOMKilled":
                app_status_oomkilled += 1
        print("- Zookeepers: CPU %s/%s mCore; Memory %s/%s Mi; Restart %s" % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart))
        output = "%s %s %s %s %s %s %s %s " % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart, app_status_running, app_status_crashloopbackoff,
            app_status_oomkilled)
        return output

    def calculate_overlimit(self):
        """Count pods whose CPU / memory usage reached their limit.

        A limit of 0 means no limit is set and is never counted, matching
        ``WriteLog.calculate_overlimit``.

        :return: "<cpu over limit> <memory over limit> <replicas>".
        """
        app_cpu_overlimit = 0
        app_memory_overlimit = 0
        pods = self._pods()
        for info in pods.values():
            cpu_limit = info.get("pod_cpu_limits", 0)
            memory_limit = info.get("pod_memory_limits", 0)
            if cpu_limit != 0 and cpu_limit <= info.get("cpu_value", 0):
                app_cpu_overlimit += 1
            if memory_limit != 0 and memory_limit <= info.get("memory_value", 0):
                app_memory_overlimit += 1
        num_replica = len(pods)
        print("- Zookeepers: OverLimit %s; OOM: %s\n" % (
            app_cpu_overlimit, app_memory_overlimit))
        output = "%s %s %s" % (app_cpu_overlimit, app_memory_overlimit,
                               num_replica)
        return output

    def write_logs(self, algo_name):
        """Collect fresh metrics and append one line to the zookeeper metrics file.

        The file is ``<traffic_path>/<algo_name>_zookeeper_metrics``;
        ``traffic_path`` is expected to be defined at module level.

        :return: 0 on success, -1 when the file could not be written.
        """
        self.w.get_deploymentconfig()
        self.w.get_pod_info()
        self.w.get_limits()
        self.w.get_metrics()
        self.w.get_status()
        file_name = "%s/%s_zookeeper_metrics" % (traffic_path, algo_name)
        timestamp = int(time.time())
        line = "%s " % (timestamp)
        line += self.calculate_pod_info()
        line += self.calculate_overlimit()
        line += "\n"
        try:
            with open(file_name, "a") as f:
                f.write(line)
        except Exception as e:
            print("failed to write zookeeper logs(%s): %s" % (file_name, str(e)))
            return -1
        return 0
class Consumer(Client):
    """Metrics logger for the "consumer" deployment in "myproject".

    Collection is delegated to the class-level ``WriteLog`` instance ``w``,
    which is configured with this application's namespace, name and type.
    """

    oc = OC()
    k = Kubectl()
    w = WriteLog()

    def __init__(self):
        super(Consumer, self).__init__()
        self.namespace = "myproject"
        self.app_name = "consumer"
        self.app_type = "deployment"
        self.w.namespace = self.namespace
        self.w.app_name = self.app_name
        self.w.app_type = self.app_type

    def wait_time(self, value):
        """Sleep for ``value`` seconds."""
        time.sleep(value)

    def _pods(self):
        """Return the collected ``{pod_name: values}`` dict, or {} if none was found."""
        return self.w.app_list.get(self.app_name, {})

    def calculate_pod_info(self):
        """Sum usage, limits, restarts and status counts over all pods.

        OOM kills are counted from each pod's ``reason`` list.

        :return: eight space-separated values followed by a trailing space:
            cpu, cpu limit, memory, memory limit, restarts, number of
            Running / CrashLoopBackOff pods, number of OOMKilled reasons.
        """
        app_cpu_value = 0
        app_memory_value = 0
        app_cpu_limit = 0
        app_memory_limit = 0
        app_restart = 0
        app_status_running = 0
        app_status_crashloopbackoff = 0
        app_status_oomkilled = 0
        for info in self._pods().values():
            # Items that were not collected for a pod contribute nothing.
            app_cpu_value += info.get("cpu_value", 0)
            app_memory_value += info.get("memory_value", 0)
            app_cpu_limit += info.get("pod_cpu_limits", 0)
            app_memory_limit += info.get("pod_memory_limits", 0)
            app_restart += info.get("restart", 0)
            status = info.get("status")
            if status == "Running":
                app_status_running += 1
            if status == "CrashLoopBackOff":
                app_status_crashloopbackoff += 1
            for reason in info.get("reason", []):
                if reason == "OOMKilled":
                    app_status_oomkilled += 1
        print("- Consumers: CPU %s/%s mCore; Memory %s/%s Mi; Restart %s OOMKilled %s" % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart, app_status_oomkilled))
        output = "%s %s %s %s %s %s %s %s " % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart, app_status_running, app_status_crashloopbackoff,
            app_status_oomkilled)
        return output

    def calculate_overlimit(self):
        """Count pods whose CPU / memory usage reached their limit.

        A limit of 0 means no limit is set and is never counted, matching
        ``WriteLog.calculate_overlimit``.

        :return: "<cpu over limit> <memory over limit> <replicas> ".
        """
        app_cpu_overlimit = 0
        app_memory_overlimit = 0
        pods = self._pods()
        for info in pods.values():
            cpu_limit = info.get("pod_cpu_limits", 0)
            memory_limit = info.get("pod_memory_limits", 0)
            if cpu_limit != 0 and cpu_limit <= info.get("cpu_value", 0):
                app_cpu_overlimit += 1
            if memory_limit != 0 and memory_limit <= info.get("memory_value", 0):
                app_memory_overlimit += 1
        num_replica = len(pods)
        print("- Consumers: Replica: %s\n" % (num_replica))
        output = "%s %s %s " % (app_cpu_overlimit, app_memory_overlimit,
                                num_replica)
        return output

    def calculate_performance(self, group_name, topic_name):
        """Average lag and offsets of ``group_name`` on ``topic_name`` over 3 samples.

        Each sample is one ``describe_consumer_group`` call; lines containing
        the topic name and no "Error" are parsed by column position.

        :return: "<group> <topic> <lag> <active> <inactive> <log offset>
            <current offset> <partitions> " (trailing space included).
        """
        total_lag = 0
        total_log_offset = 0
        total_current_offset = 0
        inactive_client = 0
        partition_list = []
        active_client_list = []
        num_sample = 3
        for _ in range(num_sample):
            output = self.describe_consumer_group(group_name)
            print("===")
            print("%s" % output)
            print("===")
            for line in output.split("\n"):
                if not line or line.find(topic_name) == -1:
                    continue
                if line.find("Error") != -1:
                    continue
                fields = line.split()
                partition = int(fields[2])
                if partition not in partition_list:
                    partition_list.append(partition)
                total_current_offset += int(fields[3])
                total_log_offset += int(fields[4])
                total_lag += int(fields[5])
                consumer_id = fields[6]
                # A partition row without a "consumer-1..." id is counted as
                # having no attached client.
                if consumer_id.find("consumer-1") == -1:
                    inactive_client += 1
                if consumer_id not in active_client_list:
                    active_client_list.append(consumer_id)
        total_lag = total_lag / (num_sample * 1.0)
        total_log_offset = total_log_offset / (num_sample * 1.0)
        total_current_offset = total_current_offset / (num_sample * 1.0)
        inactive_client = inactive_client / (num_sample * 1.0)
        active_client = len(active_client_list)
        print("- Consumers: Log Offset %s; Current Offset %s; Lag %s;" % (
            total_log_offset, total_current_offset, total_lag))
        print("- Consumers: Active %s; Inactive %s" % (
            active_client, inactive_client))
        print("\n")
        output = "%s %s %s %s %s %s %s %s " % (
            group_name, topic_name, total_lag, active_client, inactive_client,
            total_log_offset, total_current_offset, len(partition_list))
        return output

    def write_logs(self, algo_name, group_name, topic_name):
        """Collect fresh metrics and append one line to the consumer metrics file.

        The file is ``<traffic_path>/<algo_name>_consumer_metrics``;
        ``traffic_path`` is expected to be defined at module level.
        ``group_name`` and ``topic_name`` are currently unused because the
        ``calculate_performance`` column is disabled.

        :return: 0 on success, -1 when the file could not be written.
        """
        self.w.get_deploymentconfig()
        self.w.get_pod_info()
        self.w.get_limits()
        self.w.get_metrics()
        self.w.get_status()
        file_name = "%s/%s_consumer_metrics" % (traffic_path, algo_name)
        timestamp = int(time.time())
        line = "%s " % (timestamp)
        line += self.calculate_pod_info()
        line += self.calculate_overlimit()
        # calculate_performance(group_name, topic_name) is intentionally not
        # appended here (disabled in the original code).
        line += "\n"
        try:
            with open(file_name, "a") as f:
                f.write(line)
        except Exception as e:
            print("failed to write consumer logs(%s): %s" % (file_name, str(e)))
            return -1
        return 0

    def delete_all_consumer_groups(self):
        """Delete every consumer group returned by ``list_consumer_group``."""
        for group in self.list_consumer_group():
            self.delete_consumer_group(group)
def inhibit_kured(options):
    """Call ``inhibit_kured()`` on a ``Kubectl`` built from ``options.conf``."""
    client = Kubectl(options.conf)
    client.inhibit_kured()
def kubectl(conf, target):
    """Return a ``Kubectl`` constructed from ``conf`` and ``target``."""
    client = Kubectl(conf, target)
    return client
class Training:
    """Sample node / pod CPU usage and connect latency while traffic is running."""

    k = Kubectl()
    o = OC()
    n = Nginx()

    def __init__(self):
        pass

    @staticmethod
    def _average(values):
        """Return ``sum(values) / len(values)``, or 0 for an empty sequence."""
        values = list(values)
        if not values:
            return 0
        return sum(values) / len(values)

    def get_node_list(self):
        """Return the node names listed by ``get_nodes`` (header line skipped)."""
        output = self.o.get_nodes()
        return [
            line.split()[0]
            for line in output.split("\n")
            if line and line.find("NAME") == -1
        ]

    def get_node_usage(self):
        """Return ``(max, average)`` CPU usage percentage over all nodes.

        Expected ``top_node`` output::

            NAME     CPU(cores)   CPU%   MEMORY(bytes)   MEMORY%
            h5-135   655m         8%     5703Mi          17%

        Returns ``(0, 0)`` when no node usage could be read.
        """
        node_usage = {"cpu": {}, "memory": {}}
        for node in self.get_node_list():
            output = self.k.top_node(node)
            for line in output.split("\n"):
                if not line or line.find("NAME") != -1:
                    continue
                fields = line.split()
                node_usage["cpu"][node] = int(fields[2].split("%")[0])
                node_usage["memory"][node] = int(fields[-1].split("%")[0])
        cpu_values = list(node_usage["cpu"].values())
        if not cpu_values:
            return 0, 0
        return max(cpu_values), self._average(cpu_values)

    def get_pod_usage(self, app_name, app_namespace):
        """Return ``(max, average, pod count)`` of CPU usage (mCore) for the app's pods.

        Pods are looked up with the module-level ``find_pod_name``.  Max and
        average are 0 when no pod usage could be read.
        """
        pod_usage = {"cpu": {}, "memory": {}}
        pod_name_list = find_pod_name(app_name, app_namespace)
        for pod in pod_name_list:
            output = self.k.top_pod(pod, app_namespace)
            for line in output.split("\n"):
                if not line or line.find("NAME") != -1:
                    continue
                fields = line.split()
                pod_usage["cpu"][pod] = int(fields[1].split("m")[0])
                pod_usage["memory"][pod] = int(fields[-1].split("M")[0])
        cpu_values = list(pod_usage["cpu"].values())
        num_pod = len(pod_name_list)
        if not cpu_values:
            return 0, 0, num_pod
        return max(cpu_values), self._average(cpu_values), num_pod

    def import_traffic(self, ratio, i):
        """Start ``run_ab.py`` in the background and return the ``os.system`` status.

        ``i`` is accepted for interface compatibility and is not used.
        """
        cmd = "python ./run_ab.py %d %d &" % (0, ratio)
        ret = os.system(cmd)
        return ret

    def get_traffic_info(self):
        """Return the third column of every "Connect: " line in the files under ./traffic."""
        dir_name = "./traffic"
        latency_list = []
        for traffic in os.listdir(dir_name):
            traffic_file = "./%s/%s" % (dir_name, traffic)
            if not os.path.exists(traffic_file):
                continue
            with open(traffic_file, "r") as f:
                output = f.read()
            for line in output.split("\n"):
                if line.find("Connect: ") != -1:
                    latency_list.append(int(line.split()[2]))
        return latency_list

    def collect_usage(self, app_namespace, app_name):
        """Poll node and pod usage for 120 seconds and return the averages.

        Usage is sampled every 5 seconds; connect latency is read once from
        the traffic files after the polling window.

        :return: dict with ``max_node``, ``avg_node``, ``max_pod``,
            ``avg_pod`` and ``avg_connect_latency`` (0 for any series that
            has no samples).
        """
        data = {}
        max_node_usage_list = []
        avg_node_usage_list = []
        max_pod_usage_list = []
        avg_pod_usage_list = []
        start_time = time.time()
        timeout = 120
        print("collect %ds resource usage" % timeout)
        while True:
            if time.time() - start_time > timeout:
                print("time is up to %ds..." % timeout)
                break
            max_node_usage, avg_node_usage = self.get_node_usage()
            max_pod_usage, avg_pod_usage, num_pod = self.get_pod_usage(
                app_name, app_namespace)
            max_node_usage_list.append(max_node_usage)
            avg_node_usage_list.append(avg_node_usage)
            max_pod_usage_list.append(max_pod_usage)
            avg_pod_usage_list.append(avg_pod_usage)
            time.sleep(5)
        connect_latency_list = self.get_traffic_info()
        max_node_usage = self._average(max_node_usage_list)
        avg_node_usage = self._average(avg_node_usage_list)
        max_pod_usage = self._average(max_pod_usage_list)
        avg_pod_usage = self._average(avg_pod_usage_list)
        avg_connect_latency = self._average(connect_latency_list)
        print("max. node = %s %%" % max_node_usage)
        print("avg. node = %s %%" % avg_node_usage)
        print("max. pod =  %s m" % max_pod_usage)
        print("avg. pod =  %s m" % avg_pod_usage)
        print("avg. connect latency =  %s ms" % avg_connect_latency)
        data["max_node"] = max_node_usage
        data["avg_node"] = avg_node_usage
        data["max_pod"] = max_pod_usage
        data["avg_pod"] = avg_pod_usage
        data["avg_connect_latency"] = avg_connect_latency
        return data
class Client(object):
    """Run Kafka command-line tools inside cluster pods through ``oc exec``.

    Services and pods are located by substring matches on the output of the
    ``OC`` wrapper's listing commands.  Several methods read module-level
    names that are not defined in this class (``message_size``,
    ``group_name``, ``messages``).
    """

    oc = OC()
    kubectl = Kubectl()
    zookeeper = ""

    def __init__(self):
        pass

    # ------------------------------------------------------------------
    # lookup helpers
    # ------------------------------------------------------------------
    def _find_pods(self, matches, label, plural=None):
        """Return ``(namespace, pod names)`` for listing lines accepted by ``matches``.

        :param matches: callable taking one listing line and returning a bool.
        :param label: component name used in the failure message.
        :param plural: when given, a summary line is printed on success.
        """
        ns = ""
        pod_list = []
        output = self.oc.get_pods_all_namespace()
        try:
            for line in output.split("\n"):
                if not matches(line):
                    continue
                fields = line.split()
                ns = fields[0]
                pod_list.append(fields[1])
        except Exception as e:
            print("it cannot find %s pod: %s" % (label, str(e)))
            return ns, pod_list
        if plural:
            print("find %s %s in ns (%s)" % (len(pod_list), plural, ns))
        return ns, pod_list

    def _producer_target(self):
        """Return ``(namespace, pod, broker ip, broker port)`` for running a tool.

        The pod is the first running producer pod.

        :raises IndexError: when no running producer pod exists.
        """
        _, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_producer_pod()
        if not pod_list:
            raise IndexError("producer is not existed")
        return ns, pod_list[0], ip, port

    def find_broker_ip(self):
        """Return ``(namespace, ip, port)`` of the Kafka broker service.

        The last service line containing "my-cluster" but none of
        "bootstrap", "zookeeper" or "exporter" is used.  The port is
        hard-coded to 9092; on a lookup error it is returned as "".
        """
        ns = ""
        ip = ""
        port = ""
        output = self.oc.get_services_all_namespace()
        try:
            for line in output.split("\n"):
                if line.find("my-cluster") == -1:
                    continue
                if (line.find("bootstrap") != -1
                        or line.find("zookeeper") != -1
                        or line.find("exporter") != -1):
                    continue
                fields = line.split()
                ns = fields[0]
                ip = fields[1]
        except Exception as e:
            print("it cannot find broker ip: %s" % str(e))
            return ns, ip, port
        port = 9092
        print("find broker ip (%s:%s)" % (ip, port))
        return ns, ip, port

    def find_zookeeper_ip(self):
        """Return ``(namespace, ip, port)`` of the zookeeper client service.

        The port is hard-coded to 2181; on a lookup error it is returned as "".
        """
        ns = ""
        ip = ""
        port = ""
        output = self.oc.get_services_all_namespace()
        try:
            for line in output.split("\n"):
                if (line.find("zookeeper-client") != -1
                        and line.find("zookeeper-headless") == -1):
                    fields = line.split()
                    ns = fields[0]
                    ip = fields[1]
        except Exception as e:
            print("it cannot find zookeeper ip: %s" % str(e))
            return ns, ip, port
        return ns, ip, 2181

    def find_producer_pod(self):
        """Return ``(namespace, pods)`` for running pods whose line contains "producer"."""
        return self._find_pods(
            lambda line: line.find("producer") != -1
            and line.find("Running") != -1,
            "producer")

    def find_consumer_pod(self):
        """Return ``(namespace, pods)`` for running pods whose line contains "consumer"."""
        return self._find_pods(
            lambda line: line.find("consumer") != -1
            and line.find("Running") != -1,
            "consumer")

    def find_zookeeper_pod(self):
        """Return ``(namespace, pods)`` for pods whose line contains "zookeeper-client"."""
        return self._find_pods(
            lambda line: line.find("zookeeper-client") != -1,
            "zookeeper", plural="zookeepers")

    def find_broker_pod(self):
        """Return ``(namespace, pods)`` for the Kafka broker pods.

        A line matches when it contains "my-cluster" and none of "export",
        "operator" or "zookeeper".
        """
        return self._find_pods(
            lambda line: line.find("my-cluster") != -1
            and line.find("export") == -1
            and line.find("operator") == -1
            and line.find("zookeeper") == -1,
            "broker", plural="brokers")

    # ------------------------------------------------------------------
    # topics
    # ------------------------------------------------------------------
    def list_topic(self):
        """Return the distinct topic names reported by ``kafka-topics.sh --list``."""
        topic_list = []
        ns, pod, ip, port = self._producer_target()
        cmd = "/opt/kafka/bin/kafka-topics.sh --bootstrap-server %s:%s --list" % (ip, port)
        output = self.oc.exec_cmd(ns, pod, cmd)
        if not output:
            print("there is no topics in %s" % pod)
        else:
            for line in output.split("\n"):
                if not line:
                    continue
                item = line.split()[0]
                if item and item not in topic_list:
                    topic_list.append(item)
        print("current topics: %s" % ",".join(topic_list))
        return topic_list

    def describe_topic(self, topic_name):
        """Return the raw output of ``kafka-topics.sh --describe`` for ``topic_name``."""
        ns, pod, ip, port = self._producer_target()
        cmd = "/opt/kafka/bin/kafka-topics.sh --bootstrap-server %s:%s --describe --topic %s" % (ip, port, topic_name)
        return self.oc.exec_cmd(ns, pod, cmd)

    def create_topic(self, topic_name):
        """Create ``topic_name`` with partitions and replication equal to the broker count.

        Background: the replication factor must not exceed the number of
        brokers (see https://blog.csdn.net/u010886217/article/details/83119774).
        """
        _, broker_list = self.find_broker_pod()
        ns, pod, ip, port = self._producer_target()
        partition = len(broker_list)
        replication = len(broker_list)
        cmd = "/opt/kafka/bin/kafka-topics.sh --bootstrap-server %s:%s --topic %s --create --partitions %d --replication-factor %d" % (ip, port, topic_name, partition, replication)
        print(cmd)
        output = self.oc.exec_cmd(ns, pod, cmd)
        print(output)
        return output

    def delete_topic(self, topic_name):
        """Delete ``topic_name`` and return the tool output."""
        print("delete topic: %s" % topic_name)
        ns, pod, ip, port = self._producer_target()
        cmd = "/opt/kafka/bin/kafka-topics.sh --delete --bootstrap-server %s:%s --topic %s delete.topic.enable=true" % (ip, port, topic_name)
        return self.oc.exec_cmd(ns, pod, cmd)

    def modify_topic(self, topic_name, num_partition):
        """Alter ``topic_name`` to ``num_partition`` partitions and return the tool output."""
        print("modify topic: %s" % topic_name)
        ns, pod, ip, port = self._producer_target()
        cmd = "/opt/kafka/bin/kafka-topics.sh --alter --bootstrap-server %s:%s --topic %s --partitions %s" % (ip, port, topic_name, num_partition)
        print(cmd)
        output = self.oc.exec_cmd(ns, pod, cmd)
        print(output)
        return output

    def get_topic_info(self, topic_name):
        """Return the partition count parsed from ``describe_topic``, or 0 on failure."""
        output = self.describe_topic(topic_name)
        try:
            # Second whitespace-separated token, value after the ":".
            num_partition = int(output.split()[1].split(":")[1])
        except Exception:
            num_partition = 0
        print("%s has %s partitions" % (topic_name, num_partition))
        return num_partition

    def delete_topic_data(self, topic_name):
        """Print the first entry matching ``topic_name`` in each broker log dir under /data.

        NOTE(review): despite the name, nothing is deleted here.
        """
        nfs_dir = "/data"
        for broker in os.listdir(nfs_dir):
            broker_dir = "%s/%s" % (nfs_dir, broker)
            for log_dir in os.listdir(broker_dir):
                broker_data = "%s/%s" % (broker_dir, log_dir)
                for data in os.listdir(broker_data):
                    if data.find(topic_name) != -1:
                        print(data)
                        break

    # ------------------------------------------------------------------
    # consumer groups
    # ------------------------------------------------------------------
    def list_consumer_group(self):
        """Return the consumer group names (lines containing "Note" are skipped).

        :raises IndexError: when no running producer pod exists to run the tool in.
        """
        ns, pod, ip, port = self._producer_target()
        cmd = "/opt/kafka/bin/kafka-consumer-groups.sh --bootstrap-server %s:%s --list" % (ip, port)
        output = self.oc.exec_cmd(ns, pod, cmd)
        return [
            group for group in output.split("\n")
            if group and group.find("Note") == -1
        ]

    def describe_consumer_group(self, consumer_group_name):
        """Return the raw output of ``kafka-consumer-groups.sh --describe``."""
        ns, pod, ip, port = self._producer_target()
        cmd = "/opt/kafka/bin/kafka-consumer-groups.sh --bootstrap-server %s:%s --describe --group %s" % (ip, port, consumer_group_name)
        return self.oc.exec_cmd(ns, pod, cmd)

    def delete_consumer_group(self, consumer_group_name):
        """Delete ``consumer_group_name`` and return the tool output."""
        print("delete consumer group:  %s" % consumer_group_name)
        ns, pod, ip, port = self._producer_target()
        cmd = "/opt/kafka/bin/kafka-consumer-groups.sh --bootstrap-server %s:%s --delete --group %s" % (ip, port, consumer_group_name)
        print(cmd)
        return self.oc.exec_cmd(ns, pod, cmd)

    def get_consumer_group_info(self, topic_name, group_name):
        """Return ``{partition: current_offset}`` for ``topic_name`` in ``group_name``.

        Values are read from columns 2 and 3 of each describe line that
        contains the topic name.
        """
        topic_info = {}
        output = self.describe_consumer_group(group_name)
        for line in output.split("\n"):
            if line.find(topic_name) != -1:
                fields = line.split()
                topic_info[int(fields[1])] = int(fields[2])
        return topic_info

    # ------------------------------------------------------------------
    # load generation / measurement
    # ------------------------------------------------------------------
    def producer_per_test(self, topic_name, message_count):
        """Run ``kafka-producer-perf-test.sh`` in every producer pod.

        The record size comes from the module-level ``message_size``.
        References:
        https://gist.github.com/ueokande/b96eadd798fff852551b80962862bfb3
        https://blog.csdn.net/tom_fans/article/details/75517367

        :return: ``{pod: metrics}``; metrics are all 0 when the summary line
            could not be parsed.
        :raises Exception: when there is no producer pod or a run produced
            no output.
        """
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_producer_pod()
        if not pod_list:
            raise Exception("producer is not existed")
        pod_info = {}
        record_size = message_size
        for pod in pod_list:
            metrics = pod_info[pod] = {}
            cmd = "/opt/kafka/bin/kafka-producer-perf-test.sh --topic %s --num-records %s --record-size %s --throughput 1000000 --producer-props bootstrap.servers=%s:%s" % (topic_name, message_count, record_size, ip, port)
            print(cmd)
            output = self.oc.exec_cmd(ns, pod, cmd)
            if not output:
                raise Exception("failed to produces messages")
            print("%s produces %s messages for topic %s" % (pod, message_count, topic_name))
            try:
                for line in output.split("\n"):
                    fields = line.split()
                    if line and len(fields) > 20 and line.find("OpenJDK") == -1:
                        metrics["record"] = int(fields[0])
                        metrics["throughput"] = float(fields[3])
                        metrics["avg_latency"] = float(fields[7])
                        metrics["max_latency"] = float(fields[11])
                        metrics["50th_latency"] = float(fields[15])
                        metrics["95th_latency"] = float(fields[18])
                        metrics["99th_latency"] = float(fields[21])
                        metrics["99.9th_latency"] = float(fields[24])
            except Exception as e:
                print("failed to get producer metrics: %s" % str(e))
                for name in ("record", "throughput", "avg_latency",
                             "max_latency", "50th_latency", "95th_latency",
                             "99th_latency", "99.9th_latency"):
                    metrics[name] = 0
        return pod_info

    def consumer_per_test(self, topic_name, message_count):
        """Run ``kafka-consumer-perf-test.sh`` in every consumer pod.

        Reference: https://gist.github.com/ueokande/b96eadd798fff852551b80962862bfb3

        :return: ``{pod: {"MB.sec": ..., "nMsg.sec": ...}}``.
        :raises Exception: when there is no consumer pod.
        """
        print("--- consumer_per_test ---")
        ns, ip, port = self.find_zookeeper_ip()
        ns, pod_list = self.find_consumer_pod()
        if not pod_list:
            raise Exception("consumer is not existed")
        pod_info = {}
        for pod in pod_list:
            pod_info[pod] = {}
            cmd = "/opt/kafka/bin/kafka-consumer-perf-test.sh --topic %s --messages %s --zookeeper=%s:%s --threads 1" % (topic_name, message_count, ip, port)
            print(cmd)
            output = self.oc.exec_cmd(ns, pod, cmd)
            print("%s receives %s messages for topic %s" % (pod, message_count, topic_name))
            for line in output.split("\n"):
                # Skip the header line; the result line is parsed from its tail.
                if line and line.find("start.time") == -1:
                    fields = line.split()
                    pod_info[pod]["MB.sec"] = float(fields[-3].split(",")[0])
                    pod_info[pod]["nMsg.sec"] = float(fields[-1])
        return pod_info

    def simple_consumer_shell(self, topic_name, max_messages):
        """Run ``kafka-simple-consumer-shell.sh`` in every consumer pod.

        Pod ``i`` reads partition ``i % partitions`` starting at the group's
        current offset.  The group is the module-level ``group_name``.
        Reference: https://segmentfault.com/a/1190000016106045

        :raises Exception: when the topic reports no partitions.
        """
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_consumer_pod()
        num_partition = self.get_topic_info(topic_name)
        topic_info = self.get_consumer_group_info(topic_name, group_name)
        if num_partition == 0:
            raise Exception("%s is not existed" % (topic_name))
        for pod in pod_list:
            partition = pod_list.index(pod) % num_partition
            offset = topic_info[partition]
            cmd = "/opt/kafka/bin/kafka-simple-consumer-shell.sh --broker-list %s:%s --partition %s --offset %s --max-messages %s --topic %s --property group_id=%s" % (ip, port, partition, offset, max_messages, topic_name, group_name)
            print(cmd)
            self.oc.exec_cmd(ns, pod, cmd)
            print("consumer(%s) of group(%s) receives %s messages at offset(%s) in partition(%s) of topic(%s) " % (pod, group_name, max_messages, offset, partition, topic_name))
        return 0

    def console_consumer(self, topic_name):
        """Run ``kafka-console-consumer.sh`` (group "test1") in every consumer pod."""
        print("--- console-consumer ---")
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_consumer_pod()
        for pod in pod_list:
            cmd = "/opt/kafka/bin/kafka-console-consumer.sh --bootstrap-server %s:%s --topic %s --consumer-property group.id=test1" % (ip, port, topic_name)
            print(cmd)
            self.oc.exec_cmd(ns, pod, cmd)
        return 0

    def verify_consumer(self, group_name, topic_name):
        """Run ``kafka-verifiable-consumer.sh`` in every consumer pod."""
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_consumer_pod()
        for pod in pod_list:
            cmd = "/opt/kafka/bin/kafka-verifiable-consumer.sh --broker-list %s:%s --group-id %s --topic %s" % (ip, port, group_name, topic_name)
            print(cmd)
            self.oc.exec_cmd(ns, pod, cmd)
            print("consumer(%s) of group(%s) receives messages for topic(%s) " % (pod, group_name, topic_name))
        return 0

    def verify_producer(self, topic_name):
        """Run ``kafka-verifiable-producer.sh`` in every consumer pod.

        The message count is the module-level ``messages``.
        NOTE(review): this executes in consumer pods, not producer pods —
        confirm that is intended.
        """
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_consumer_pod()
        for pod in pod_list:
            cmd = "kafka-verifiable-producer.sh --broker-list %s:%s --max-messages %s --topic %s" % (ip, port, messages, topic_name)
            self.oc.exec_cmd(ns, pod, cmd)
            print("producer(%s) send %s messages to topic(%s) " % (pod, messages, topic_name))
        return 0

    def end_to_end_latency(self, topic_name, num_messages):
        """Run ``kafka.tools.EndToEndLatency`` in every consumer pod.

        :return: the output of the last pod's run.
        :raises Exception: when there is no consumer pod.
        """
        print("--- latency from producer to broker and broker to consumer ---")
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_consumer_pod()
        if not pod_list:
            raise Exception("consumer is not existed")
        for pod in pod_list:
            cmd = "/opt/kafka/bin/kafka-run-class.sh kafka.tools.EndToEndLatency %s:%s %s %s all 100" % (ip, port, topic_name, num_messages)
            output = self.oc.exec_cmd(ns, pod, cmd)
        return output

    def compute_avg_metrics(self, pod_info_list):
        """Average the ``producer_per_test`` results of several runs.

        "record" is averaged per pod and per run; every other metric is
        weighted by each pod's record count.  The metric names are taken
        from the first pod of the first run.

        :param pod_info_list: list of ``{pod: metrics}`` dicts.
        :return: ``{metric: average}``; {} for empty input, and 0 for the
            weighted metrics when no records were produced.
        """
        if not pod_info_list or not pod_info_list[0]:
            return {}
        first_run = pod_info_list[0]
        pod_num = len(first_run)
        first_pod = next(iter(first_run))
        total_metric = dict((metric, 0) for metric in first_run[first_pod])
        avg_metric = dict.fromkeys(total_metric, 0)
        for pod_info in pod_info_list:
            for pod in pod_info:
                metrics = pod_info[pod]
                for metric in metrics:
                    value = metrics.get(metric)
                    if not value:
                        continue
                    if metric != "record":
                        value = value * metrics["record"]
                    total_metric[metric] = total_metric.get(metric, 0) + value
        num_time = len(pod_info_list)
        total_record = total_metric.get("record", 0)
        for metric in avg_metric:
            if metric == "record":
                avg_metric[metric] = total_metric[metric] / (pod_num * 1.0) / num_time
            elif total_record:
                avg_metric[metric] = total_metric[metric] / (pod_num * 1.0) / total_record
        return avg_metric

    def calculate_consumer_rate(self):
        """Return the consumers' processing rate computed from their JSON logs.

        The rate is ``total count / (last timestamp - first timestamp) * 1000``
        over the log lines that carry a non-zero "count"; lines containing
        "WARN" are ignored.  Returns 0 when the time span is 0.
        """
        consumer_rate = 0
        ns, pod_list = self.find_consumer_pod()
        start_time = 0
        end_time = 0
        total_count = 0
        # First pass: timestamp of the first counted line of each pod (the
        # last pod's value wins).
        for pod in pod_list:
            output = self.oc.log_pod(ns, pod)
            for line in output.split("\n"):
                if not line or line.find("WARN") != -1:
                    continue
                entry = json.loads(line)
                if entry.get("count"):
                    start_time = entry.get("timestamp")
                    break
        # Second pass: total count and timestamp of the last counted line.
        for pod in pod_list:
            output = self.oc.log_pod(ns, pod)
            for line in output.split("\n"):
                if not line or line.find("WARN") != -1:
                    continue
                entry = json.loads(line)
                count = entry.get("count")
                if count:
                    total_count += count
                    end_time = entry.get("timestamp")
        time_diff = (end_time - start_time) * 1.0
        if time_diff:
            consumer_rate = total_count / time_diff * 1000
        print("consumer process rate:  %s %s %s" % (
            consumer_rate, total_count, time_diff))
        return consumer_rate
class Broker(Client): oc = OC() k = Kubectl() w = WriteLog() def __init__(self): super(Broker, self).__init__() self.namespace = "myproject" self.app_name = "my-cluster-kafka" self.app_type = "statefulset" self.w.namespace = self.namespace self.w.app_name = self.app_name self.w.app_type = self.app_type def wait_time(self, value): # print "wait %d seconds" % value time.sleep(value) def calculate_pod_info(self): app_cpu_value = 0 app_memory_value = 0 app_cpu_limit = 0 app_memory_limit = 0 app_restart = 0 app_status_running = 0 app_status_crashloopbackoff = 0 app_status_oomkilled = 0 for pod in self.w.app_list[self.app_name].keys(): if pod.find("zookeeper") != -1 or pod.find("exporter") != -1: continue for item in self.w.app_list[self.app_name][pod].keys(): if item in ["cpu_value"]: app_cpu_value += self.w.app_list[ self.app_name][pod]["cpu_value"] elif item in ["memory_value"]: app_memory_value += self.w.app_list[ self.app_name][pod]["memory_value"] elif item in ["pod_cpu_limits"]: app_cpu_limit += self.w.app_list[ self.app_name][pod]["pod_cpu_limits"] elif item in ["pod_memory_limits"]: app_memory_limit += self.w.app_list[ self.app_name][pod]["pod_memory_limits"] elif item in ["restart"]: app_restart += self.w.app_list[ self.app_name][pod]["restart"] elif item == "status": status = self.w.app_list[self.app_name][pod]["status"] if status in ["Running"]: app_status_running += 1 if status in ["CrashLoopBackOff"]: app_status_crashloopbackoff += 1 if status in ["OOMKilled"]: app_status_oomkilled += 1 print "- Brokers: CPU %s/%s mCore; Memory %s/%s Mi; Restart %s" % ( app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit, app_restart) output = "%s %s %s %s %s %s %s %s " % ( app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit, app_restart, app_status_running, app_status_crashloopbackoff, app_status_oomkilled) return output def calculate_overlimit(self): app_cpu_overlimit = 0 app_memory_overlimit = 0 count = 0 # calculate overlimit for pod in 
self.w.app_list[self.app_name].keys(): if pod.find("zookeeper") != -1 or pod.find("exporter") != -1: continue cpu_value = self.w.app_list[self.app_name][pod]["cpu_value"] memory_value = self.w.app_list[self.app_name][pod]["memory_value"] cpu_limit = self.w.app_list[self.app_name][pod]["pod_cpu_limits"] memory_limit = self.w.app_list[ self.app_name][pod]["pod_memory_limits"] if cpu_limit <= cpu_value: app_cpu_overlimit += 1 if memory_limit <= memory_value: app_memory_overlimit += 1 count += 1 num_replica = count print "- Brokers: OverLimit %s; OOM: %s" % (app_cpu_overlimit, app_memory_overlimit) output = "%s %s %s " % (app_cpu_overlimit, app_memory_overlimit, num_replica) return output def calculate_performance(self): num_partition = 0 output = self.describe_topic(topic_name) for line in output.split("\n"): if line and line.find("ReplicationFactor") == -1: if line.find("Isr") != -1: num_partition += 1 print "- Brokers: Partitions %s" % num_partition result = "%s " % num_partition return result def write_logs(self, algo_name): self.w.get_deploymentconfig() self.w.get_pod_info() self.w.get_limits() self.w.get_metrics() self.w.get_status() file_name = "%s/%s_broker_metrics" % (traffic_path, algo_name) timestamp = int(time.time()) line = "%s " % (timestamp) line += self.calculate_pod_info() line += self.calculate_overlimit() line += self.calculate_performance() line += "\n" try: with open(file_name, "a") as f: f.write(line) except Exception as e: print "failed to write broker logs(%s): %s" % (file_name, str(e)) return -1 # print "success to write broker logs(%s)" % file_name return 0