Esempio n. 1
0
class WriteLog:
    """Gather pod metrics, limits and status via ``OC``/``Kubectl`` and log them.

    Callers select the target by assigning ``namespace``, ``app_name`` and
    ``app_type`` before invoking the ``get_*`` methods.  Results accumulate in
    ``app_list``, shaped ``{workload_name: {pod_name: {field: value}}}``.
    """

    k = Kubectl()
    # Seconds per sampling round in calculate_overlimit().
    wait_time = 30
    metric_item_list = ["cpu_value", "memory_value"]
    limit_item_list = ["pod_cpu_limits", "pod_memory_limits"]
    request_item_list = ["pod_cpu_requests", "pod_memory_requests"]
    # Rebound to a fresh dict by get_deploymentconfig() on every refresh.
    app_list = {}
    app_name = ""
    namespace = ""
    cpu_limit = 0
    mem_limit = 0
    oc = OC()
    # One of "deployment", "deploymentconfig" or "statefulset".
    app_type = ""

    def __init__(self):
        pass

    def find_deploymentconfig_by_namespace(self, app_name):
        """Return names of ``self.app_type`` workloads whose listing line
        contains *app_name*.

        The name is taken from the first whitespace-separated column of each
        matching line.  An unrecognised ``app_type`` yields an empty list.
        """
        output = ""
        if self.app_type == "deployment":
            output = self.oc.get_deployment(self.namespace)
        elif self.app_type == "deploymentconfig":
            output = self.oc.get_deploymentconfig(self.namespace)
        elif self.app_type == "statefulset":
            output = self.oc.get_statefulset(self.namespace)
        return [
            line.split()[0] for line in output.split("\n")
            if app_name in line
        ]

    def find_pod_by_namespace(self, app_name):
        """Return pod names in ``self.namespace`` whose listing line contains
        *app_name*, skipping pods with "build" in their name."""
        pod_name_list = []
        output = self.oc.get_pods(self.namespace)
        for line in output.split("\n"):
            if app_name not in line:
                continue
            pod_name = line.split()[0]
            if "build" in pod_name:
                continue
            pod_name_list.append(pod_name)
        return pod_name_list

    def get_deploymentconfig(self):
        """Reset ``app_list`` to one empty entry per matching workload."""
        self.app_list = {}
        deployment_name_list = self.find_deploymentconfig_by_namespace(
            self.app_name)
        for deployment in deployment_name_list:
            self.app_list[deployment] = {}

    def get_pod_info(self):
        """Attach every matching pod to each workload whose name is a
        substring of the pod name."""
        pod_name_list = self.find_pod_by_namespace(self.app_name)
        for pod_name in pod_name_list:
            for deployment in self.app_list.keys():
                if deployment in pod_name:
                    self.app_list[deployment][pod_name] = {}

    def get_metrics(self):
        """Store current CPU and memory usage of every known pod.

        Values are parsed from the last two columns of the ``top_pod`` output
        line that contains the pod name; pods without such a line keep 0.
        """
        self.kubectl = Kubectl()
        # Zero everything first so pods missing from the output still
        # carry both metric keys.
        for pods in self.app_list.values():
            for pod_info in pods.values():
                for metric_item in self.metric_item_list:
                    pod_info[metric_item] = 0
        for pods in self.app_list.values():
            for pod_name, pod_info in pods.items():
                output = self.kubectl.top_pod(pod_name, self.namespace)
                for line in output.split("\n"):
                    if pod_name not in line:
                        continue
                    fields = line.split()
                    cpu = int(fields[-2].strip("m"))  # mCore
                    memory = int(fields[-1].strip("Mi"))  # MB
                    pod_info["cpu_value"] = cpu
                    pod_info["memory_value"] = memory

    def get_pod_limit(self, pod_name):
        """Return ``(cpu_limit, memory_limit)`` for the pod's first container.

        CPU values with an "m" suffix are used as-is; other CPU values are
        multiplied by 1000.  Memory values containing "M" are used as-is and
        values containing "G" are multiplied by 1000.  Missing limits give 0.
        Any parsing error is reported on stdout and the values computed so
        far are returned.
        """
        cpu_limit = 0
        memory_limit = 0
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if not output:
            return cpu_limit, memory_limit
        try:
            spec = json.loads(output).get("spec", {})
            resources = spec.get("containers", [])[0].get("resources") or {}
            # "limits" can be absent when only requests are configured;
            # treat that the same as having no limit instead of failing.
            limits = resources.get("limits") or {}
            cpu_limit_mcore = limits.get("cpu", "0m")
            memory_limit_mb = limits.get("memory", "0Mi")
            if cpu_limit_mcore and cpu_limit_mcore.find("m") != -1:
                cpu_limit = float(cpu_limit_mcore.split("m")[0])
            else:
                cpu_limit = float(cpu_limit_mcore) * 1000
            if memory_limit_mb and memory_limit_mb.find("M") != -1:
                memory_limit = float(memory_limit_mb.split("M")[0])
            elif memory_limit_mb and memory_limit_mb.find("G") != -1:
                memory_limit = float(memory_limit_mb.split("G")[0]) * 1000
        except Exception as e:
            print("failed to get limits: %s" % str(e))
        return cpu_limit, memory_limit

    def get_limits(self):
        """Store the CPU and memory limits of every known pod.

        Each pod is queried once; the result fills every key listed in
        ``limit_item_list``.
        """
        for pods in self.app_list.values():
            for pod_name, pod_info in pods.items():
                cpu_limit, memory_limit = self.get_pod_limit(pod_name)
                for metric_item in self.limit_item_list:
                    if metric_item == "pod_cpu_limits":
                        pod_info[metric_item] = cpu_limit
                    else:
                        pod_info[metric_item] = memory_limit

    def get_pod_reason(self, pod_name):
        """Return a list holding the last termination reason of the pod's
        first container, or an empty list when there is none.

        Pods without ``containerStatuses`` or whose ``lastState`` has no
        ``terminated`` entry yield an empty list.
        """
        reason_list = []
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if not output:
            return reason_list
        status = json.loads(output).get("status") or {}
        container_statuses = status.get("containerStatuses") or []
        if not container_statuses:
            return reason_list
        last_state = container_statuses[0].get("lastState") or {}
        terminated = last_state.get("terminated")
        if terminated:
            reason_list.append(terminated.get("reason"))
        return reason_list

    def get_status(self, is_reason=True):
        """Record ``status`` and ``restart`` (and optionally ``reason``) for
        every known pod.

        Status and restart count come from the third and fourth columns of
        the pod listing line whose first column equals the pod name.

        :param is_reason: when true, also store ``get_pod_reason()`` under
            the ``"reason"`` key.
        """
        lines = self.oc.get_pods(self.namespace).split("\n")
        for pods in self.app_list.values():
            for pod_name, pod_info in pods.items():
                for line in lines:
                    if self.app_name not in line:
                        continue
                    fields = line.split()
                    if fields[0] != pod_name:
                        continue
                    status = fields[2]
                    restart = int(fields[3])
                    pod_info["status"] = status
                    pod_info["restart"] = restart
                    if is_reason:
                        pod_info["reason"] = self.get_pod_reason(pod_name)

    def get_node_status(self):
        """Return ``{node_name: {"status", "cpu", "memory"}}`` for all nodes.

        ``cpu`` and ``memory`` are parsed from the second and fourth columns
        of the ``top_node`` line containing the node name and are absent when
        no such line exists.
        """
        node_info = {}
        for line in self.oc.get_nodes().split("\n"):
            # Skip blank lines and the header row.
            if not line or "NAME" in line:
                continue
            fields = line.split()
            node_name = fields[0]
            node_info[node_name] = {"status": fields[1]}
            usage_output = self.k.top_node(node_name)
            for usage_line in usage_output.split("\n"):
                if node_name not in usage_line:
                    continue
                usage_fields = usage_line.split()
                cpu = int(usage_fields[1].split("m")[0])
                memory = int(usage_fields[3].split("Mi")[0])
                node_info[node_name]["cpu"] = cpu
                node_info[node_name]["memory"] = memory
        return node_info

    def calculate_overlimit(self, algo, time_count):
        """Sample all pods for *time_count* minutes and append one metrics
        row per workload and sample via ``write_metric``.

        :param algo: algorithm label; the part before the first "_" is used
            in progress messages and the full value in the metrics file name.
        :param time_count: collection duration in minutes.

        Over-limit, OOM, restart and terminated counters accumulate over the
        whole run and are not reset between samples.
        """
        cpu_count = 0
        memory_count = 0
        total_restart = 0
        total_terminated = 0
        data_count = int(time_count * 60 / self.wait_time)
        algo_label = algo.split("_")[0]
        print("--- %s collect data and write to logs for %d minutes ---" % (
            algo_label.upper(), time_count))
        for i in range(data_count):
            start_time = time.time()
            self.get_deploymentconfig()
            self.get_pod_info()
            self.get_limits()
            self.get_metrics()
            self.get_status()
            # The total used to be computed from an undefined name
            # (data_interval), which raised NameError on the first sample.
            print("--- %s start to collect data at %d/%d interval(in 30 sec) ---" % (
                algo_label, i, data_count))
            for deployment in self.app_list.keys():
                pods = self.app_list[deployment]
                cpu_limit = 0
                memory_limit = 0
                total_cpu = 0
                total_memory = 0
                total_cpu_limit = 0
                total_memory_limit = 0
                # Defined up front so a workload without pods cannot hit an
                # unbound name when the row is built below.
                restart = 0
                for pod in pods.keys():
                    pod_info = pods[pod]
                    # A pod without a CPU limit keeps the limits of the
                    # previously visited pod of this workload.
                    if pod_info.get("pod_cpu_limits"):
                        cpu_limit = pod_info["pod_cpu_limits"]
                        memory_limit = pod_info["pod_memory_limits"]
                    cpu = pod_info["cpu_value"]
                    memory = pod_info["memory_value"]
                    total_cpu += cpu
                    total_memory += memory
                    total_cpu_limit += cpu_limit
                    total_memory_limit += memory_limit
                    if cpu >= cpu_limit and cpu_limit != 0:
                        cpu_count += 1
                    if memory >= memory_limit and memory_limit != 0:
                        memory_count += 1
                    restart = pod_info.get("restart", 0)
                    total_restart += restart
                    total_terminated += len(pod_info.get("reason", []))
                num_replica = len(pods.keys())
                print("%s total_cpu= %s m" % (self.app_name, total_cpu))
                print("%s total_memory= %s Mi" % (self.app_name, total_memory))
                print("%s current replica=%d" % (self.app_name, num_replica))
                print("%s overflow= %s times" % (self.app_name, cpu_count))
                print("%s oom= %s times" % (self.app_name, memory_count))
                print("%s restart= %s times" % (self.app_name, total_restart))
                print("%s terminated= %s times" % (self.app_name,
                                                   total_terminated))
                print("\n")
                total_status = 0
                algo_name = "%s-%s" % (self.app_name, algo)
                # NOTE(review): the restart column holds the last pod's
                # restart count, not total_restart - confirm with consumers
                # of the metrics file before changing it.
                data = [
                    algo_name, total_cpu, total_cpu_limit, total_memory,
                    total_memory_limit, cpu_count, memory_count, num_replica,
                    restart, total_status
                ]
                self.write_metric(data)
            # Pad the round up to wait_time seconds so that data_count
            # rounds span roughly time_count minutes.
            while time.time() - start_time < self.wait_time:
                time.sleep(5)

    def write_metric(self, data):
        """Append *data* plus the current Unix timestamp as one
        space-separated line to ``./metrics/<data[0]>``.

        The timestamp is appended to *data* in place.  Write failures are
        reported on stdout and not raised.
        """
        data.append(str(int(time.time())))
        try:
            fn = "./metrics/%s" % data[0]
            with open(fn, "a") as f:
                f.write("%s\n" % " ".join([str(elem) for elem in data]))
        except Exception as e:
            print("failed to write metrics:%s" % str(e))
Esempio n. 2
0
def check_system_pods_ready(conf, platform):
    """Return ``check_pods_ready`` for the ``kube-system`` namespace, using a
    ``Kubectl`` built from *conf* and *platform*."""
    return check_pods_ready(Kubectl(conf, platform), namespace="kube-system")
Esempio n. 3
0
def kubectl(conf):
    """Return a ``Kubectl`` instance constructed from *conf*."""
    client = Kubectl(conf)
    return client
class Zookeeper(Client):
    """Collect and log resource figures for the ``my-cluster-zookeeper``
    statefulset in the ``myproject`` namespace.

    Data collection is delegated to the class-level ``WriteLog`` instance
    ``w``, which is configured in ``__init__``.
    """

    oc = OC()
    k = Kubectl()
    w = WriteLog()

    def __init__(self):
        super(Zookeeper, self).__init__()
        self.namespace = "myproject"
        self.app_name = "my-cluster-zookeeper"
        self.app_type = "statefulset"
        self.w.namespace = self.namespace
        self.w.app_name = self.app_name
        self.w.app_type = self.app_type

    def wait_time(self, value):
        """Sleep for *value* seconds."""
        time.sleep(value)

    def calculate_pod_info(self):
        """Return the aggregated pod figures as a space-terminated string.

        Fields, in order: CPU usage, CPU limit, memory usage, memory limit,
        restarts, and the number of pods whose status is ``Running``,
        ``CrashLoopBackOff`` and ``OOMKilled``.  All figures are 0 when the
        application has no entry in ``w.app_list``.
        """
        app_cpu_value = 0
        app_memory_value = 0
        app_cpu_limit = 0
        app_memory_limit = 0
        app_restart = 0
        app_status_running = 0
        app_status_crashloopbackoff = 0
        app_status_oomkilled = 0

        # A missing entry means the workload was not found in this round;
        # report zeros instead of raising KeyError.
        pods = self.w.app_list.get(self.app_name, {})
        for pod_info in pods.values():
            app_cpu_value += pod_info.get("cpu_value", 0)
            app_memory_value += pod_info.get("memory_value", 0)
            app_cpu_limit += pod_info.get("pod_cpu_limits", 0)
            app_memory_limit += pod_info.get("pod_memory_limits", 0)
            app_restart += pod_info.get("restart", 0)
            status = pod_info.get("status")
            if status == "Running":
                app_status_running += 1
            if status == "CrashLoopBackOff":
                app_status_crashloopbackoff += 1
            if status == "OOMKilled":
                app_status_oomkilled += 1

        print("- Zookeepers: CPU %s/%s mCore; Memory %s/%s Mi; Restart %s" % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart))
        output = "%s %s %s %s %s %s %s %s " % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart, app_status_running, app_status_crashloopbackoff,
            app_status_oomkilled)
        return output

    def calculate_overlimit(self):
        """Return ``"<cpu_overlimit> <memory_overlimit> <replicas>"``.

        A pod counts as over its limit when usage reaches a non-zero limit.
        A limit of 0 means no limit was found and is never counted, matching
        the ``limit != 0`` guard in ``WriteLog.calculate_overlimit``.
        """
        app_cpu_overlimit = 0
        app_memory_overlimit = 0

        pods = self.w.app_list.get(self.app_name, {})
        for pod_info in pods.values():
            cpu_value = pod_info.get("cpu_value", 0)
            memory_value = pod_info.get("memory_value", 0)
            cpu_limit = pod_info.get("pod_cpu_limits", 0)
            memory_limit = pod_info.get("pod_memory_limits", 0)
            if cpu_limit != 0 and cpu_limit <= cpu_value:
                app_cpu_overlimit += 1
            if memory_limit != 0 and memory_limit <= memory_value:
                app_memory_overlimit += 1
        num_replica = len(pods)
        print("- Zookeepers: OverLimit %s; OOM: %s\n" % (app_cpu_overlimit,
                                                         app_memory_overlimit))
        output = "%s %s %s" % (app_cpu_overlimit, app_memory_overlimit,
                               num_replica)
        return output

    def write_logs(self, algo_name):
        """Refresh all pod data and append one line to
        ``<traffic_path>/<algo_name>_zookeeper_metrics``.

        ``traffic_path`` is a name defined outside this class.  The line is
        the Unix timestamp followed by the output of ``calculate_pod_info``
        and ``calculate_overlimit``.

        :return: 0 on success, -1 when the file could not be written.
        """
        self.w.get_deploymentconfig()
        self.w.get_pod_info()
        self.w.get_limits()
        self.w.get_metrics()
        self.w.get_status()

        file_name = "%s/%s_zookeeper_metrics" % (traffic_path, algo_name)
        timestamp = int(time.time())
        line = "%s " % (timestamp)
        line += self.calculate_pod_info()
        line += self.calculate_overlimit()
        line += "\n"

        try:
            with open(file_name, "a") as f:
                f.write(line)
        except Exception as e:
            print("failed to write zookeeper logs(%s): %s" % (file_name,
                                                              str(e)))
            return -1

        return 0
Esempio n. 5
0
class Consumer(Client):
    """Collect and log resource figures for the ``consumer`` deployment in
    the ``myproject`` namespace.

    Data collection is delegated to the class-level ``WriteLog`` instance
    ``w``, which is configured in ``__init__``.
    """

    oc = OC()
    k = Kubectl()
    w = WriteLog()

    def __init__(self):
        super(Consumer, self).__init__()
        self.namespace = "myproject"
        self.app_name = "consumer"
        self.app_type = "deployment"
        self.w.namespace = self.namespace
        self.w.app_name = self.app_name
        self.w.app_type = self.app_type

    def wait_time(self, value):
        """Sleep for *value* seconds."""
        time.sleep(value)

    def calculate_pod_info(self):
        """Return the aggregated pod figures as a space-terminated string.

        Fields, in order: CPU usage, CPU limit, memory usage, memory limit,
        restarts, number of ``Running`` pods, number of ``CrashLoopBackOff``
        pods, and the number of ``OOMKilled`` entries found in the pods'
        ``reason`` lists.  All figures are 0 when the application has no
        entry in ``w.app_list``.
        """
        app_cpu_value = 0
        app_memory_value = 0
        app_cpu_limit = 0
        app_memory_limit = 0
        app_restart = 0
        app_status_running = 0
        app_status_crashloopbackoff = 0
        app_status_oomkilled = 0

        # A missing entry means the workload was not found in this round;
        # report zeros instead of raising KeyError.
        pods = self.w.app_list.get(self.app_name, {})
        for pod_info in pods.values():
            app_cpu_value += pod_info.get("cpu_value", 0)
            app_memory_value += pod_info.get("memory_value", 0)
            app_cpu_limit += pod_info.get("pod_cpu_limits", 0)
            app_memory_limit += pod_info.get("pod_memory_limits", 0)
            app_restart += pod_info.get("restart", 0)
            status = pod_info.get("status")
            if status == "Running":
                app_status_running += 1
            if status == "CrashLoopBackOff":
                app_status_crashloopbackoff += 1
            for reason in pod_info.get("reason", []):
                if reason == "OOMKilled":
                    app_status_oomkilled += 1

        print("- Consumers: CPU %s/%s mCore; Memory %s/%s Mi; Restart %s OOMKilled %s" % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart, app_status_oomkilled))
        output = "%s %s %s %s %s %s %s %s " % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart, app_status_running, app_status_crashloopbackoff,
            app_status_oomkilled)
        return output

    def calculate_overlimit(self):
        """Return ``"<cpu_overlimit> <memory_overlimit> <replicas> "``.

        A pod counts as over its limit when usage reaches a non-zero limit.
        A limit of 0 means no limit was found and is never counted, matching
        the ``limit != 0`` guard in ``WriteLog.calculate_overlimit``.
        """
        app_cpu_overlimit = 0
        app_memory_overlimit = 0

        pods = self.w.app_list.get(self.app_name, {})
        for pod_info in pods.values():
            cpu_value = pod_info.get("cpu_value", 0)
            memory_value = pod_info.get("memory_value", 0)
            cpu_limit = pod_info.get("pod_cpu_limits", 0)
            memory_limit = pod_info.get("pod_memory_limits", 0)
            if cpu_limit != 0 and cpu_limit <= cpu_value:
                app_cpu_overlimit += 1
            if memory_limit != 0 and memory_limit <= memory_value:
                app_memory_overlimit += 1
        num_replica = len(pods)
        print("- Consumers: Replica: %s\n" % (num_replica))
        output = "%s %s %s " % (app_cpu_overlimit, app_memory_overlimit,
                                num_replica)
        return output

    def calculate_performance(self, group_name, topic_name):
        """Sample ``describe_consumer_group`` three times and return averaged
        offset figures as a space-terminated string.

        Only lines containing *topic_name* and not containing "Error" are
        parsed; columns 3-7 are read as partition, current offset, log
        offset, lag and consumer id.  Lag, offsets and the inactive count are
        divided by the number of samples.  ``active`` is the number of
        distinct consumer ids seen and ``inactive`` counts lines whose
        consumer id does not contain "consumer-1".

        Output fields: group, topic, lag, active, inactive, log offset,
        current offset, number of distinct partitions.
        """
        total_lag = 0
        total_log_offset = 0
        total_current_offset = 0
        inactive_client = 0
        partition_list = []
        active_client_list = []
        num_sample = 3
        for _ in range(num_sample):
            output = self.describe_consumer_group(group_name)
            print("===")
            print("%s" % output)
            print("===")
            for line in output.split("\n"):
                if not line or topic_name not in line or "Error" in line:
                    continue
                fields = line.split()
                partition = int(fields[2])
                if partition not in partition_list:
                    partition_list.append(partition)
                current_offset = int(fields[3])
                log_offset = int(fields[4])
                lag = int(fields[5])
                consumer_id = fields[6]
                total_log_offset += log_offset
                total_current_offset += current_offset
                total_lag += lag
                if "consumer-1" not in consumer_id:
                    inactive_client += 1
                if consumer_id not in active_client_list:
                    active_client_list.append(consumer_id)
        samples = num_sample * 1.0
        total_lag = total_lag / samples
        total_log_offset = total_log_offset / samples
        total_current_offset = total_current_offset / samples
        inactive_client = inactive_client / samples
        active_client = len(active_client_list)
        print("- Consumers: Log Offset %s; Current Offset %s; Lag %s;" % (
            total_log_offset, total_current_offset, total_lag))
        print("- Consumers: Active %s; Inactive %s" % (active_client,
                                                      inactive_client))
        print("\n")
        output = "%s %s %s %s %s %s %s %s " % (
            group_name, topic_name, total_lag, active_client, inactive_client,
            total_log_offset, total_current_offset, len(partition_list))
        return output

    def write_logs(self, algo_name, group_name, topic_name):
        """Refresh all pod data and append one line to
        ``<traffic_path>/<algo_name>_consumer_metrics``.

        ``traffic_path`` is a name defined outside this class.  The line is
        the Unix timestamp followed by the output of ``calculate_pod_info``
        and ``calculate_overlimit``.  *group_name* and *topic_name* are
        currently unused because ``calculate_performance`` is not part of
        the line.

        :return: 0 on success, -1 when the file could not be written.
        """
        self.w.get_deploymentconfig()
        self.w.get_pod_info()
        self.w.get_limits()
        self.w.get_metrics()
        self.w.get_status()

        file_name = "%s/%s_consumer_metrics" % (traffic_path, algo_name)
        timestamp = int(time.time())
        line = "%s " % (timestamp)
        line += self.calculate_pod_info()
        line += self.calculate_overlimit()
        # hungo test - block calculate (per maygy)
        #line += self.calculate_performance(group_name, topic_name)
        line += "\n"

        try:
            with open(file_name, "a") as f:
                f.write(line)
        except Exception as e:
            print("failed to write consumer logs(%s): %s" % (file_name, str(e)))
            return -1

        return 0

    def delete_all_consumer_groups(self):
        """Call ``delete_consumer_group`` for every group returned by
        ``list_consumer_group``."""
        for group in self.list_consumer_group():
            self.delete_consumer_group(group)
Esempio n. 6
0
def inhibit_kured(options):
    """Call ``inhibit_kured()`` on a ``Kubectl`` built from ``options.conf``."""
    client = Kubectl(options.conf)
    client.inhibit_kured()
Esempio n. 7
0
def kubectl(conf, target):
    """Return a ``Kubectl`` instance constructed from *conf* and *target*."""
    client = Kubectl(conf, target)
    return client
class Training:
    """Drive traffic and sample node/pod usage through ``Kubectl`` and ``OC``."""

    k = Kubectl()
    o = OC()
    n = Nginx()

    def __init__(self):
        #self.o.login("admin", "password")
        pass

    def get_node_list(self):
        """Return the first column of every non-header, non-empty line of
        ``o.get_nodes()``."""
        return [
            line.split()[0] for line in self.o.get_nodes().split("\n")
            if line and "NAME" not in line
        ]

    def get_node_usage(self):
        """Return ``(max_cpu_percent, avg_cpu_percent)`` over all nodes.

        Returns ``(0, 0)`` when no usage line could be read, instead of
        failing on an empty sequence.
        """
        # kubectl top node h5-135
        # NAME      CPU(cores)   CPU%      MEMORY(bytes)   MEMORY%
        # h5-135    655m         8%        5703Mi          17%
        node_usage = {"cpu": {}, "memory": {}}
        for node in self.get_node_list():
            output = self.k.top_node(node)
            for line in output.split("\n"):
                if not line or "NAME" in line:
                    continue
                fields = line.split()
                cpu_usage = int(fields[2].split("%")[0])
                memory_usage = int(fields[-1].split("%")[0])
                node_usage["cpu"][node] = cpu_usage
                node_usage["memory"][node] = memory_usage
        cpu_values = list(node_usage["cpu"].values())
        if not cpu_values:
            return 0, 0
        avg_node_usage = sum(cpu_values) / len(cpu_values)
        max_node_usage = max(cpu_values)
        return max_node_usage, avg_node_usage

    def get_pod_usage(self, app_name, app_namespace):
        """Return ``(max_cpu, avg_cpu, num_pod)`` for the application's pods.

        Pod names come from ``find_pod_name`` (defined outside this class).
        CPU is read from the second column of each ``top_pod`` data line with
        the "m" suffix removed.  Max and average are 0 when no usage line
        could be read.
        """
        pod_usage = {"cpu": {}, "memory": {}}
        pod_name_list = find_pod_name(app_name, app_namespace)
        for pod in pod_name_list:
            output = self.k.top_pod(pod, app_namespace)
            for line in output.split("\n"):
                if not line or "NAME" in line:
                    continue
                fields = line.split()
                cpu_usage = int(fields[1].split("m")[0])
                memory_usage = int(fields[-1].split("M")[0])
                pod_usage["cpu"][pod] = cpu_usage
                pod_usage["memory"][pod] = memory_usage
        num_pod = len(pod_name_list)
        cpu_values = list(pod_usage["cpu"].values())
        if not cpu_values:
            return 0, 0, num_pod
        avg_pod_usage = sum(cpu_values) / len(cpu_values)
        max_pod_usage = max(cpu_values)
        return max_pod_usage, avg_pod_usage, num_pod

    def import_traffic(self, ratio, i):
        """Start ``./run_ab.py 0 <ratio>`` in the background via the shell.

        *i* is accepted but not used.

        :return: the value returned by ``os.system``.
        """
        cmd = "python ./run_ab.py %d %d &" % (0, ratio)
        ret = os.system(cmd)
        return ret

    def get_traffic_info(self):
        """Return the integer third field of every line containing
        ``"Connect:  "`` in the files under ``./traffic``."""
        dir_name = "./traffic"
        latency_list = []
        for traffic in os.listdir(dir_name):
            traffic_file = os.path.join(dir_name, traffic)
            if not os.path.exists(traffic_file):
                continue
            with open(traffic_file, "r") as f:
                for line in f.read().split("\n"):
                    if "Connect:  " in line:
                        latency_list.append(int(line.split()[2]))
        return latency_list

    def collect_usage(self, app_namespace, app_name, timeout=120):
        """Sample node and pod CPU usage every 5 seconds for *timeout*
        seconds and return the averages.

        :param app_namespace: namespace passed to ``get_pod_usage``.
        :param app_name: application name passed to ``get_pod_usage``.
        :param timeout: sampling window in seconds (default 120).
        :return: dict with keys ``max_node``, ``avg_node``, ``max_pod``,
            ``avg_pod`` and ``avg_connect_latency``.  The latency is 0 when
            no traffic result is available.
        """
        data = {}
        max_node_usage_list = []
        avg_node_usage_list = []
        max_pod_usage_list = []
        avg_pod_usage_list = []
        start_time = time.time()
        print("collect %ds resource usage" % timeout)
        while True:
            if time.time() - start_time > timeout:
                print("time is up to %ds..." % timeout)
                break
            max_node_usage, avg_node_usage = self.get_node_usage()
            max_pod_usage, avg_pod_usage, num_pod = self.get_pod_usage(
                app_name, app_namespace)
            max_node_usage_list.append(max_node_usage)
            avg_node_usage_list.append(avg_node_usage)
            max_pod_usage_list.append(max_pod_usage)
            avg_pod_usage_list.append(avg_pod_usage)
            time.sleep(5)
        # Traffic results are only needed once, after sampling has finished.
        connect_latency_list = self.get_traffic_info()
        max_node_usage = self._mean(max_node_usage_list)
        avg_node_usage = self._mean(avg_node_usage_list)
        max_pod_usage = self._mean(max_pod_usage_list)
        avg_pod_usage = self._mean(avg_pod_usage_list)
        avg_connect_latency = self._mean(connect_latency_list)
        print("max. node = %s %%" % max_node_usage)
        print("avg. node = %s %%" % avg_node_usage)
        print("max. pod =  %s m" % max_pod_usage)
        print("avg. pod =  %s m" % avg_pod_usage)
        print("avg. connect latency =  %s ms" % avg_connect_latency)
        data["max_node"] = max_node_usage
        data["avg_node"] = avg_node_usage
        data["max_pod"] = max_pod_usage
        data["avg_pod"] = avg_pod_usage
        data["avg_connect_latency"] = avg_connect_latency
        return data

    @staticmethod
    def _mean(values):
        """Return ``sum(values) / len(values)``, or 0 for an empty list."""
        if not values:
            return 0
        return sum(values) / len(values)
class Client(object):
    oc = OC()
    kubectl = Kubectl()
    zookeeper = ""

    def __init__(self):
        pass

    def find_broker_ip(self):
        """Look up the Kafka broker service.

        Scans the output of ``oc.get_services_all_namespace()`` for lines that
        mention "my-cluster" but none of "bootstrap", "zookeeper" or
        "exporter"; the last such line wins.  The first column is taken as
        the namespace and the second as the address.

        Returns:
            (namespace, ip, 9092) on success -- the port is hard-coded.  If
            parsing raises, the values gathered so far are returned with an
            empty-string port.
        """
        default_port = 9092
        excluded_words = ("bootstrap", "zookeeper", "exporter")
        ns = ""
        ip = ""
        output = self.oc.get_services_all_namespace()
        try:
            for line in output.split("\n"):
                if "my-cluster" not in line:
                    continue
                if any(word in line for word in excluded_words):
                    continue
                fields = line.split()
                ns = fields[0]
                ip = fields[1]
        except Exception as e:
            print "it cannot find broker ip: %s" % str(e)
            return ns, ip, ""
        # Report the port that is actually returned (it used to print an
        # empty string here).
        print "find broker ip (%s:%s)" % (ip, default_port)
        return ns, ip, default_port

    def find_zookeeper_ip(self):
        """Look up the zookeeper client service.

        Scans the output of ``oc.get_services_all_namespace()`` for lines that
        mention "zookeeper-client" but not "zookeeper-headless"; the last such
        line wins.  The first column is taken as the namespace and the second
        as the address.

        Returns:
            (namespace, ip, 2181) on success -- the port is hard-coded.  If
            parsing raises, the values gathered so far are returned with an
            empty-string port.
        """
        namespace = ""
        address = ""
        services = self.oc.get_services_all_namespace()
        try:
            for row in services.split("\n"):
                if "zookeeper-client" not in row:
                    continue
                if "zookeeper-headless" in row:
                    continue
                columns = row.split()
                namespace = columns[0]
                address = columns[1]
        except Exception as e:
            print "it cannot find zookeeper ip: %s" % str(e)
            return namespace, address, ""
        return namespace, address, 2181

    def find_producer_pod(self):
        """Collect the running producer pods across all namespaces.

        A line of ``oc.get_pods_all_namespace()`` counts when it contains both
        "producer" and "Running"; its first column is the namespace and its
        second the pod name.

        Returns:
            (namespace, pod_names).  The namespace is the one of the last
            matching line; on a parsing error the partial result is returned.
        """
        namespace = ""
        producers = []
        listing = self.oc.get_pods_all_namespace()
        try:
            for row in listing.split("\n"):
                if "producer" not in row or "Running" not in row:
                    continue
                columns = row.split()
                namespace = columns[0]
                producers.append(columns[1])
        except Exception as e:
            print "it cannot find producer pod: %s" % str(e)
        return namespace, producers

    def find_consumer_pod(self):
        """Collect the running consumer pods across all namespaces.

        A line of ``oc.get_pods_all_namespace()`` counts when it contains both
        "consumer" and "Running"; its first column is the namespace and its
        second the pod name.

        Returns:
            (namespace, pod_names).  The namespace is the one of the last
            matching line; on a parsing error the partial result is returned.
        """
        namespace = ""
        consumers = []
        listing = self.oc.get_pods_all_namespace()
        try:
            for row in listing.split("\n"):
                if "consumer" not in row or "Running" not in row:
                    continue
                columns = row.split()
                namespace = columns[0]
                consumers.append(columns[1])
        except Exception as e:
            print "it cannot find consumer pod: %s" % str(e)
        return namespace, consumers

    def find_zookeeper_pod(self):
        ns = ""
        pod_list = []
        output = self.oc.get_pods_all_namespace()
        try:
            for line in output.split("\n"):
                if line.find("zookeeper-client") != -1:
                    ns = line.split()[0]
                    pod = line.split()[1]
                    pod_list.append(pod)
        except Exception as e:
            print "it cannot find consumer pod: %s" % str(e)
            return ns, pod_list
        print "find %s zookeepers in ns (%s)" % (len(pod_list), ns)
        return ns, pod_list

    def find_broker_pod(self):
        ns = ""
        pod_list = []
        output = self.oc.get_pods_all_namespace()
        try:
            for line in output.split("\n"):
                if line.find("my-cluster") != -1 and line.find("export") == -1 and line.find("operator") == -1 and line.find("zookeeper") == -1:
                    ns = line.split()[0]
                    pod = line.split()[1]
                    pod_list.append(pod)
        except Exception as e:
            print "it cannot find consumer pod: %s" % str(e)
            return ns, pod_list
        print "find %s brokers in ns (%s)" % (len(pod_list), ns)
        return ns, pod_list

    def list_topic(self):
        topic_list = []
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_producer_pod()
        pod = pod_list[0]
        cmd = "/opt/kafka/bin/kafka-topics.sh --bootstrap-server %s:%s --list" % (ip, port)
        output = self.oc.exec_cmd(ns, pod, cmd)
        if not output:
            print "there is no topics in %s" % pod
        else:
            for line in output.split("\n"):
                if line:
                    item = line.split()[0]
                    if item and item not in topic_list:
                        topic_list.append(item)
        print "current topics: %s" % ",".join(topic_list)
        return topic_list

    def describe_topic(self, topic_name):
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_producer_pod()
        pod = pod_list[0]
        cmd = "/opt/kafka/bin/kafka-topics.sh --bootstrap-server %s:%s --describe --topic %s" % (ip, port, topic_name)
        output = self.oc.exec_cmd(ns, pod, cmd)
        return output

    def create_topic(self, topic_name):
        # references: https://blog.csdn.net/u010886217/article/details/83119774
        # --replication-factor<=number of brokers
        # --partitions: 1x or 2x number of brokers
        ns, broker_list = self.find_broker_pod()
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_producer_pod()
        pod = pod_list[0]
        partition = len(broker_list)
        replication = len(broker_list)
        cmd = "/opt/kafka/bin/kafka-topics.sh --bootstrap-server %s:%s --topic %s --create --partitions %d --replication-factor %d" % (ip, port, topic_name, partition, replication)
        print cmd
        output = self.oc.exec_cmd(ns, pod, cmd)
        print output
        return output

    def delete_topic(self, topic_name):
        print "delete topic:", topic_name
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_producer_pod()
        pod = pod_list[0]
        cmd = "/opt/kafka/bin/kafka-topics.sh --delete --bootstrap-server %s:%s --topic %s delete.topic.enable=true" % (ip, port, topic_name)
        output = self.oc.exec_cmd(ns, pod, cmd)
        return output

    def modify_topic(self, topic_name, num_partition):
        print "modify topic:", topic_name
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_producer_pod()
        pod = pod_list[0]
        cmd = "/opt/kafka/bin/kafka-topics.sh --alter --bootstrap-server %s:%s --topic %s --partitions %s" % (ip, port, topic_name, num_partition)
        print cmd
        output = self.oc.exec_cmd(ns, pod, cmd)
        print output
        return output

    def list_consumer_group(self):
        # print "--- list consumer group ---"
        group_list = []
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_producer_pod()
        if not pod_list:
            raise Exception("consumer is not existed")
        pod = pod_list[0]
        cmd = "/opt/kafka/bin/kafka-consumer-groups.sh --bootstrap-server %s:%s --list" % (ip, port)
        output = self.oc.exec_cmd(ns, pod, cmd)
        for group in output.split("\n"):
            if group and group.find("Note") == -1:
                group_list.append(group)
        return group_list

    def describe_consumer_group(self, consumer_group_name):
        # print "describe consumer group: ", consumer_group_name
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_producer_pod()
        pod = pod_list[0]
        cmd = "/opt/kafka/bin/kafka-consumer-groups.sh --bootstrap-server %s:%s --describe --group %s" % (ip, port, consumer_group_name)
        output = self.oc.exec_cmd(ns, pod, cmd)
        return output

    def delete_consumer_group(self, consumer_group_name):
        print "delete consumer group: ", consumer_group_name
        # only delete consumer group by zookeeper
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_producer_pod()
        pod = pod_list[0]
        cmd = "/opt/kafka/bin/kafka-consumer-groups.sh --bootstrap-server %s:%s --delete --group %s" % (ip, port, consumer_group_name)
        print cmd
        output = self.oc.exec_cmd(ns, pod, cmd)
        return output

    def producer_per_test(self, topic_name, message_count):
        # reference1: https://gist.github.com/ueokande/b96eadd798fff852551b80962862bfb3
        # reference2: https://blog.csdn.net/tom_fans/article/details/75517367
        # print "--- producer_per_test ---"
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_producer_pod()
        if not pod_list:
            raise Exception("producer is not existed")
        pod_info = {}
        record_size = message_size
        for pod in pod_list:
            pod_info[pod] = {}
            cmd = "/opt/kafka/bin/kafka-producer-perf-test.sh --topic %s --num-records %s --record-size %s --throughput 1000000 --producer-props bootstrap.servers=%s:%s" % (topic_name, message_count, record_size, ip, port)  
            print cmd
            output = self.oc.exec_cmd(ns, pod, cmd)
            #print "%s: " % pod, output
            if not output:
                raise Exception("failed to produces messages")
            print "%s produces %s messages for topic %s" % (pod, message_count, topic_name)
            try:
                for line in output.split("\n"):
                    if line and len(line.split()) > 20 and line.find("OpenJDK") == -1:
                        pod_info[pod]["record"] = int(line.split()[0])
                        pod_info[pod]["throughput"] = float(line.split()[3])
                        pod_info[pod]["avg_latency"] = float(line.split()[7])
                        pod_info[pod]["max_latency"] = float(line.split()[11])
                        pod_info[pod]["50th_latency"] = float(line.split()[15])
                        pod_info[pod]["95th_latency"] = float(line.split()[18])
                        pod_info[pod]["99th_latency"] = float(line.split()[21])
                        pod_info[pod]["99.9th_latency"] = float(line.split()[24])
            except Exception as e:
                print "failed to get producer metrics: %s" % str(e)
                pod_info[pod]["record"] = 0
                pod_info[pod]["throughput"] = 0
                pod_info[pod]["avg_latency"] = 0
                pod_info[pod]["max_latency"] = 0
                pod_info[pod]["50th_latency"] = 0
                pod_info[pod]["95th_latency"] = 0
                pod_info[pod]["99th_latency"] = 0
                pod_info[pod]["99.9th_latency"] = 0
        return pod_info
        
    def consumer_per_test(self, topic_name, message_count):
        # reference: https://gist.github.com/ueokande/b96eadd798fff852551b80962862bfb3
        print "--- consumer_per_test ---"
        ns, ip, port = self.find_zookeeper_ip()
        ns, pod_list = self.find_consumer_pod()
        if not pod_list:
            raise Exception("consumer is not existed")
        pod_info = {}
        for pod in pod_list:
            pod_info[pod] = {}
            cmd = "/opt/kafka/bin/kafka-consumer-perf-test.sh --topic %s --messages %s --zookeeper=%s:%s --threads 1" % (topic_name, message_count, ip, port)
            # cmd = "/opt/kafka/bin/kafka-run-class.sh kafka.tools.ConsumerPerformance --topic %s --messages %s --zookeeper=%s:%s --threads 1" % (topic_name, message_count, ip, port)
            print cmd
            output = self.oc.exec_cmd(ns, pod, cmd)
            print "%s receives %s messages for topic %s" % (pod, message_count, topic_name)
            # print output
            for line in output.split("\n"):
                if line and line.find("start.time") == -1:
                    pod_info[pod]["MB.sec"] = float(line.split()[-3].split(",")[0])
                    pod_info[pod]["nMsg.sec"] = float(line.split()[-1])
        return pod_info

    def get_topic_info(self, topic_name):
        num_partition = 0
        output = self.describe_topic(topic_name)
        try:
            num_partition = int(output.split()[1].split(":")[1])
        except Exception as e:
            num_partition = 0
        print "%s has %s partitions" % (topic_name, num_partition)
        return num_partition

    def get_consumer_group_info(self, topic_name, group_name):
        topic_info = {}
        output = self.describe_consumer_group(group_name)
        for line in output.split("\n"):
            if line.find(topic_name) != -1:
                partition = int(line.split()[1])
                current_offset = int(line.split()[2])
                topic_info[partition] = current_offset
        return topic_info

    def simple_consumer_shell(self, topic_name, max_messages):
        # reference: https://segmentfault.com/a/1190000016106045
        # print "--- simple-consumer-shell ---"
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_consumer_pod()
        num_partition = self.get_topic_info(topic_name)
        topic_info = self.get_consumer_group_info(topic_name, group_name)
        if num_partition == 0:
            raise Exception("%s is not existed" % (topic_name))
        for pod in pod_list:
            pod_id = pod_list.index(pod)
            partition = pod_id % num_partition
            current_offset = topic_info[partition]
            offset = current_offset
            cmd = "/opt/kafka/bin/kafka-simple-consumer-shell.sh --broker-list %s:%s --partition %s --offset %s --max-messages %s --topic %s --property group_id=%s" % (ip, port, partition, offset, max_messages, topic_name, group_name)
            print cmd
            output = self.oc.exec_cmd(ns, pod, cmd)
            print "consumer(%s) of group(%s) receives %s messages at offset(%s) in partition(%s) of topic(%s) " % (pod, group_name, max_messages, offset, partition, topic_name)
        return 0

    def console_consumer(self, topic_name):
        print "--- console-consumer ---"
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_consumer_pod()
        for pod in pod_list:
            cmd = "/opt/kafka/bin/kafka-console-consumer.sh --bootstrap-server %s:%s --topic %s --consumer-property group.id=test1" % (ip, port, topic_name)
            print cmd
            output = self.oc.exec_cmd(ns, pod, cmd)
        return 0

    def verify_consumer(self, group_name, topic_name):
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_consumer_pod()
        for pod in pod_list:
            cmd = "/opt/kafka/bin/kafka-verifiable-consumer.sh --broker-list %s:%s --group-id %s --topic %s" % (ip, port, group_name, topic_name)
            print cmd
            output = self.oc.exec_cmd(ns, pod, cmd)
            print "consumer(%s) of group(%s) receives messages for topic(%s) " % (pod, group_name, topic_name)
        return 0

    def verify_producer(self, topic_name):
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_consumer_pod()
        for pod in pod_list:
            cmd = "kafka-verifiable-producer.sh --broker-list %s:%s --max-messages %s --topic %s" % (ip, port, messages, topic_name)
            output = self.oc.exec_cmd(ns, pod, cmd)
            print "producer(%s) send %s messages to topic(%s) " % (pod, messages, topic_name)
        return 0

    def end_to_end_latency(self, topic_name, num_messages):
        print "--- latency from producer to broker and broker to consumer ---"
        ns, ip, port = self.find_broker_ip()
        ns, pod_list = self.find_consumer_pod()
        if not pod_list:
            raise Exception("consumer is not existed")
        pod_info = {}
        for pod in pod_list:
            pod_info[pod] = {}
        cmd = "/opt/kafka/bin/kafka-run-class.sh kafka.tools.EndToEndLatency %s:%s %s %s all 100" % (ip, port, topic_name, num_messages)
        output = self.oc.exec_cmd(ns, pod, cmd)
        return output


    def compute_avg_metrics(self, pod_info_list):
        """Average per-pod metrics over several test runs.

        Args:
            pod_info_list: list of ``{pod_name: {metric: value}}`` dicts, one
                per run.  The metric names are taken from the first pod of
                the first run; every pod dict is expected to carry a "record"
                entry, which is used as the weight of the other metrics.

        Returns:
            dict of metric name to average.  "record" is the total record
            count divided by the number of pods and runs.  Every other metric
            is its record-weighted sum divided by the number of pods and by
            the total record count, or 0 when no records were counted at all
            (instead of raising ZeroDivisionError).

        Raises:
            IndexError: when ``pod_info_list`` or its first run is empty.
        """
        first_run = pod_info_list[0]
        pod_num = len(first_run)
        # list(...)[0] works on both Python 2 and 3 (keys()[0] does not).
        first_pod = list(first_run)[0]
        total_metric = dict.fromkeys(first_run[first_pod], 0)

        for pod_info in pod_info_list:
            for metrics in pod_info.values():
                for metric, value in metrics.items():
                    if not value:
                        # Missing/zero values contribute nothing.
                        continue
                    if metric == "record":
                        total_metric[metric] += value
                    else:
                        total_metric[metric] += value * metrics["record"]

        num_time = len(pod_info_list)
        total_record = total_metric.get("record", 0)
        avg_metric = {}
        for metric, total in total_metric.items():
            if metric == "record":
                avg_metric[metric] = total / (pod_num * 1.0) / num_time
            elif total_record:
                avg_metric[metric] = total / (pod_num * 1.0) / total_record
            else:
                # All runs failed to report records; avoid dividing by zero.
                avg_metric[metric] = 0
        return avg_metric

    def calculate_consumer_rate(self):
        consumer_rate = 0
        ns, pod_list = self.find_consumer_pod()
        start_time = 0
        end_time = 0
        total_count = 0
        comsumer_rate = 0
        for pod in pod_list:
            output = self.oc.log_pod(ns, pod)
            for line in output.split("\n"):
                # print line
                if line and line.find("WARN") == -1:
                    line = json.loads(line)
                    timestamp = line.get("timestamp")
                    count = line.get("count")
                    if count:
                        start_time = timestamp
                        break
        for pod in pod_list:
            output = self.oc.log_pod(ns, pod)
            for line in output.split("\n"):
                if line and line.find("WARN") == -1:
                    line = json.loads(line)
                    timestamp = line.get("timestamp")
                    count = line.get("count")
                    if count:
                        total_count += count
                        end_time = timestamp
        time_diff = (end_time - start_time) * 1.0
        if time_diff:
            comsumer_rate = total_count / time_diff * 1000
        print "consumer process rate: ", comsumer_rate, total_count, time_diff
        return consumer_rate


    def delete_topic_data(self, topic_name):
        nfs_dir = "/data"
        data_list = os.listdir(nfs_dir)
        for broker in data_list:
            broker_dir = "%s/%s" % (nfs_dir, broker)
            broker_data_list = os.listdir(broker_dir)
            for log_dir in broker_data_list:
                broker_data = "%s/%s" % (broker_dir, log_dir)
                data_list = os.listdir(broker_data)
                for data in data_list:
                    if data.find(topic_name) != -1:
                        print data
                        break
class Broker(Client):
    oc = OC()
    k = Kubectl()
    w = WriteLog()

    def __init__(self):
        super(Broker, self).__init__()
        self.namespace = "myproject"
        self.app_name = "my-cluster-kafka"
        self.app_type = "statefulset"
        self.w.namespace = self.namespace
        self.w.app_name = self.app_name
        self.w.app_type = self.app_type

    def wait_time(self, value):
        """Block the caller for ``value`` seconds."""
        time.sleep(value)

    def calculate_pod_info(self):
        app_cpu_value = 0
        app_memory_value = 0
        app_cpu_limit = 0
        app_memory_limit = 0
        app_restart = 0
        app_status_running = 0
        app_status_crashloopbackoff = 0
        app_status_oomkilled = 0

        for pod in self.w.app_list[self.app_name].keys():
            if pod.find("zookeeper") != -1 or pod.find("exporter") != -1:
                continue
            for item in self.w.app_list[self.app_name][pod].keys():
                if item in ["cpu_value"]:
                    app_cpu_value += self.w.app_list[
                        self.app_name][pod]["cpu_value"]
                elif item in ["memory_value"]:
                    app_memory_value += self.w.app_list[
                        self.app_name][pod]["memory_value"]
                elif item in ["pod_cpu_limits"]:
                    app_cpu_limit += self.w.app_list[
                        self.app_name][pod]["pod_cpu_limits"]
                elif item in ["pod_memory_limits"]:
                    app_memory_limit += self.w.app_list[
                        self.app_name][pod]["pod_memory_limits"]
                elif item in ["restart"]:
                    app_restart += self.w.app_list[
                        self.app_name][pod]["restart"]
                elif item == "status":
                    status = self.w.app_list[self.app_name][pod]["status"]
                    if status in ["Running"]:
                        app_status_running += 1
                    if status in ["CrashLoopBackOff"]:
                        app_status_crashloopbackoff += 1
                    if status in ["OOMKilled"]:
                        app_status_oomkilled += 1
        print "- Brokers: CPU %s/%s mCore; Memory %s/%s Mi; Restart %s" % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart)
        output = "%s %s %s %s %s %s %s %s " % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart, app_status_running, app_status_crashloopbackoff,
            app_status_oomkilled)
        return output

    def calculate_overlimit(self):
        app_cpu_overlimit = 0
        app_memory_overlimit = 0

        count = 0
        # calculate overlimit
        for pod in self.w.app_list[self.app_name].keys():
            if pod.find("zookeeper") != -1 or pod.find("exporter") != -1:
                continue
            cpu_value = self.w.app_list[self.app_name][pod]["cpu_value"]
            memory_value = self.w.app_list[self.app_name][pod]["memory_value"]
            cpu_limit = self.w.app_list[self.app_name][pod]["pod_cpu_limits"]
            memory_limit = self.w.app_list[
                self.app_name][pod]["pod_memory_limits"]
            if cpu_limit <= cpu_value:
                app_cpu_overlimit += 1
            if memory_limit <= memory_value:
                app_memory_overlimit += 1
            count += 1
        num_replica = count
        print "- Brokers: OverLimit %s; OOM: %s" % (app_cpu_overlimit,
                                                    app_memory_overlimit)
        output = "%s %s %s " % (app_cpu_overlimit, app_memory_overlimit,
                                num_replica)
        return output

    def calculate_performance(self):
        num_partition = 0
        output = self.describe_topic(topic_name)
        for line in output.split("\n"):
            if line and line.find("ReplicationFactor") == -1:
                if line.find("Isr") != -1:
                    num_partition += 1
        print "- Brokers: Partitions %s" % num_partition
        result = "%s " % num_partition
        return result

    def write_logs(self, algo_name):
        self.w.get_deploymentconfig()
        self.w.get_pod_info()
        self.w.get_limits()
        self.w.get_metrics()
        self.w.get_status()

        file_name = "%s/%s_broker_metrics" % (traffic_path, algo_name)
        timestamp = int(time.time())
        line = "%s " % (timestamp)
        line += self.calculate_pod_info()
        line += self.calculate_overlimit()
        line += self.calculate_performance()
        line += "\n"

        try:
            with open(file_name, "a") as f:
                f.write(line)
        except Exception as e:
            print "failed to write broker logs(%s): %s" % (file_name, str(e))
            return -1

        # print "success to write broker logs(%s)" % file_name
        return 0