Beispiel #1
0
def inject_node_scenario(action, node_scenario, node_scenario_object):
    """Run the node chaos scenario named by ``action``.

    Args:
        action: Name of the scenario to run. Supported names are identical
            to the method names invoked on ``node_scenario_object``.
        node_scenario: Mapping of scenario settings. Reads
            ``instance_kill_count`` (default 1), ``node_name`` (default ""),
            ``label_selector`` (default "") and ``timeout`` (default 120).
        node_scenario_object: Object exposing one method per supported
            scenario; the method is called as
            ``(instance_kill_count, node, timeout)``.

    An unrecognised ``action`` is logged and skipped instead of being
    silently ignored, so a typo in the configuration is visible in the logs.
    """
    # Local import: the original block did not use logging.
    import logging

    supported_actions = (
        "node_start_scenario",
        "node_stop_scenario",
        "node_stop_start_scenario",
        "node_termination_scenario",
        "node_reboot_scenario",
        "stop_kubelet_scenario",
        "stop_start_kubelet_scenario",
        "node_crash_scenario",
    )

    # Get the node scenario configurations
    instance_kill_count = node_scenario.get("instance_kill_count", 1)
    node_name = node_scenario.get("node_name", "")
    label_selector = node_scenario.get("label_selector", "")
    timeout = node_scenario.get("timeout", 120)

    # Get the node to apply the scenario
    node = nodeaction.get_node(node_name, label_selector)

    if action not in supported_actions:
        logging.info(
            'There is no node action that matches %s, skipping scenario',
            action)
        return

    # Every supported action maps to the method of the same name, so the
    # validated name can be used directly for the lookup.
    scenario_method = getattr(node_scenario_object, action)
    scenario_method(instance_kill_count, node, timeout)
Beispiel #2
0
def inject_node_scenario(action, node_scenario, node_scenario_object):
    """Run the node chaos scenario named by ``action``.

    Args:
        action: Name of the scenario to run.
        node_scenario: Mapping of scenario settings. Reads
            ``instance_kill_count`` (default 1), ``node_name`` (default ""),
            ``label_selector`` (default ""), ``timeout`` (default 120),
            ``service`` (default "") and ``ssh_private_key``
            (default "~/.ssh/id_rsa"). The helper node scenario additionally
            reads ``cloud_type`` and ``helper_node_ip``.
        node_scenario_object: Object exposing the scenario methods.

    Behaviour:
        * When the module-level ``node_general`` flag (defined outside this
          block) is truthy, only ``stop_kubelet_scenario`` and
          ``node_crash_scenario`` are run; any other action is logged and
          skipped.
        * ``stop_start_helper_node_scenario`` is only run when
          ``cloud_type`` is "openstack". A missing or empty
          ``helper_node_ip`` is logged and the process exits with status 1.
        * An unrecognised action is logged and skipped.
    """
    generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario")
    # Actions whose handler is the method of the same name, called with
    # (instance_kill_count, node, timeout).
    standard_actions = (
        "node_start_scenario",
        "node_stop_scenario",
        "node_stop_start_scenario",
        "node_termination_scenario",
        "node_reboot_scenario",
        "stop_start_kubelet_scenario",
        "stop_kubelet_scenario",
        "node_crash_scenario",
    )

    # Get the node scenario configurations
    instance_kill_count = node_scenario.get("instance_kill_count", 1)
    node_name = node_scenario.get("node_name", "")
    label_selector = node_scenario.get("label_selector", "")
    timeout = node_scenario.get("timeout", 120)
    service = node_scenario.get("service", "")
    ssh_private_key = node_scenario.get("ssh_private_key", "~/.ssh/id_rsa")

    # Get the node to apply the scenario
    node = nodeaction.get_node(node_name, label_selector)

    if node_general and action not in generic_cloud_scenarios:
        logging.info(
            "Scenario: %s is not set up for generic cloud type, "
            "skipping action", action)
        return

    if action in standard_actions:
        scenario_method = getattr(node_scenario_object, action)
        scenario_method(instance_kill_count, node, timeout)
        return

    if action != "stop_start_helper_node_scenario":
        logging.info(
            'There is no node action that matches %s, skipping scenario',
            action)
        return

    # Helper node scenario. Use .get so that a missing key takes the same
    # logged error path as an empty value instead of raising KeyError.
    cloud_type = node_scenario.get("cloud_type")
    if cloud_type != "openstack":
        logging.error(
            "Scenario: %s is not supported for cloud type %s, "
            "skipping action", action, cloud_type)
        return

    helper_node_ip = node_scenario.get("helper_node_ip")
    if not helper_node_ip:
        logging.error("Helper node IP address is not provided")
        sys.exit(1)

    node_scenario_object.helper_node_stop_start_scenario(
        instance_kill_count, helper_node_ip, timeout)
    node_scenario_object.helper_node_service_status(
        helper_node_ip, service, ssh_private_key, timeout)
Beispiel #3
0
def inject_node_scenario(action, node_scenario, node_scenario_object):
    """Run the node chaos scenario named by ``action``.

    Args:
        action: Name of the scenario to run. Supported names are identical
            to the method names invoked on ``node_scenario_object``.
        node_scenario: Mapping of scenario settings. Reads
            ``instance_kill_count`` (default 1), ``node_name`` (default ""),
            ``label_selector`` (default "") and ``timeout`` (default 120).
        node_scenario_object: Object exposing one method per supported
            scenario; the method is called as
            ``(instance_kill_count, node, timeout)``.

    When the module-level ``node_general`` flag (defined outside this block)
    is truthy, only ``stop_kubelet_scenario`` and ``node_crash_scenario`` are
    run; any other action is logged and skipped. An unrecognised action is
    logged and skipped instead of being silently ignored.
    """
    generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario")
    supported_actions = (
        "node_start_scenario",
        "node_stop_scenario",
        "node_stop_start_scenario",
        "node_termination_scenario",
        "node_reboot_scenario",
        "stop_start_kubelet_scenario",
        "stop_kubelet_scenario",
        "node_crash_scenario",
    )

    # Get the node scenario configurations
    instance_kill_count = node_scenario.get("instance_kill_count", 1)
    node_name = node_scenario.get("node_name", "")
    label_selector = node_scenario.get("label_selector", "")
    timeout = node_scenario.get("timeout", 120)

    # Get the node to apply the scenario
    node = nodeaction.get_node(node_name, label_selector)

    if node_general and action not in generic_cloud_scenarios:
        logging.info(
            "Scenario: %s is not set up for generic cloud type, "
            "skipping action", action)
        return

    if action not in supported_actions:
        logging.info(
            'There is no node action that matches %s, skipping scenario',
            action)
        return

    # Every supported action maps to the method of the same name.
    scenario_method = getattr(node_scenario_object, action)
    scenario_method(instance_kill_count, node, timeout)
Beispiel #4
0
def _wait_and_publish_status(joblst, test_duration, wait_duration, config,
                             failed_post_scenarios):
    """Wait for the jobs, sleep for wait_duration, then publish the status."""
    start_time = int(time.time())
    # A copy is passed, so whatever wait_for_job does to its argument leaves
    # the caller's list intact for the final cleanup.
    # NOTE(review): the second argument is presumably a timeout in seconds
    # (test duration plus a 300 s margin) -- confirm against wait_for_job.
    wait_for_job(joblst[:], test_duration + 300)
    logging.info("Waiting for wait_duration %s" % wait_duration)
    time.sleep(wait_duration)
    end_time = int(time.time())
    cerberus.publish_kraken_status(config, failed_post_scenarios,
                                   start_time, end_time)


def run(scenarios_list, config, wait_duration):
    """Run the network chaos scenario described by each config file.

    For every path in ``scenarios_list`` the YAML file is loaded and its
    ``network_chaos`` section is read. Target nodes are resolved, the
    interfaces are verified, and one job per node is created for each egress
    parameter (``latency``, ``loss``, ``bandwidth``) present in the egress
    settings.

    With ``execution: serial`` the jobs of each parameter are awaited before
    the next parameter is started. With ``execution: parallel`` jobs are
    created for the first parameter only (the loop breaks after it) and then
    awaited once.

    Args:
        scenarios_list: Iterable of paths to network chaos YAML files.
        config: Passed through unchanged to
            ``cerberus.publish_kraken_status``.
        wait_duration: Seconds to sleep after the jobs finish and before the
            status is published.

    Any exception raised while creating or waiting for jobs is logged and
    the process exits with status 1. The jobs recorded so far are deleted in
    every case.
    """
    failed_post_scenarios = ""
    logging.info("Runing the Network Chaos tests")
    param_lst = ["latency", "loss", "bandwidth"]
    for net_config in scenarios_list:
        # Keep the file open only while parsing it, not for the whole run.
        with open(net_config, "r") as file:
            test_config = yaml.safe_load(file)

        test_dict = test_config["network_chaos"]
        test_duration = int(test_dict.get("duration", 300))
        test_interface = test_dict.get("interfaces", [])
        test_node = test_dict.get("node_name", "")
        test_node_label = test_dict.get("label_selector",
                                        "node-role.kubernetes.io/master")
        test_execution = test_dict.get("execution", "serial")
        test_instance_count = test_dict.get("instance_count", 1)
        test_egress = test_dict.get("egress", {"bandwidth": "100mbit"})

        # A falsy node name (empty or None) is passed on as a single entry
        # so that the lookup below still runs once with the label selector.
        node_name_list = test_node.split(",") if test_node else [test_node]
        nodelst = []
        for single_node_name in node_name_list:
            nodelst.extend(
                common_node_functions.get_node(single_node_name,
                                               test_node_label,
                                               test_instance_count))

        file_loader = FileSystemLoader(
            os.path.abspath(os.path.dirname(__file__)))
        env = Environment(loader=file_loader)
        pod_template = env.get_template("pod.j2")
        test_interface = verify_interface(test_interface, nodelst,
                                          pod_template)

        joblst = []
        egress_lst = [param for param in param_lst if param in test_egress]
        chaos_config = {
            "network_chaos": {
                "duration": test_duration,
                "interfaces": test_interface,
                "node_name": ",".join(nodelst),
                "execution": test_execution,
                "instance_count": test_instance_count,
                "egress": test_egress,
            }
        }
        logging.info("Executing network chaos with config \n %s" %
                     yaml.dump(chaos_config))
        job_template = env.get_template("job.j2")
        try:
            for param in egress_lst:
                for node in nodelst:
                    # Use test_egress (which carries the default) rather
                    # than test_dict["egress"], which raises KeyError when
                    # the config omits the egress section.
                    exec_cmd = get_egress_cmd(test_execution,
                                              test_interface,
                                              param,
                                              test_egress,
                                              duration=test_duration)
                    logging.info("Executing %s on node %s" %
                                 (exec_cmd, node))
                    job_body = yaml.safe_load(
                        job_template.render(
                            jobname=param + str(hash(node))[:5],
                            nodename=node,
                            cmd=exec_cmd))
                    joblst.append(job_body["metadata"]["name"])
                    api_response = kubecli.create_job(job_body)
                    if api_response is None:
                        raise Exception("Error creating job")
                if test_execution == "serial":
                    logging.info("Waiting for serial job to finish")
                    _wait_and_publish_status(joblst, test_duration,
                                             wait_duration, config,
                                             failed_post_scenarios)
                if test_execution == "parallel":
                    # Only the first parameter's jobs are created in
                    # parallel mode.
                    break
            if test_execution == "parallel":
                logging.info("Waiting for parallel job to finish")
                _wait_and_publish_status(joblst, test_duration,
                                         wait_duration, config,
                                         failed_post_scenarios)
        except Exception as e:
            logging.error("Network Chaos exiting due to Exception %s" % e)
            sys.exit(1)
        finally:
            # Runs on success, on failure and on the sys.exit above.
            logging.info("Deleting jobs")
            delete_job(joblst[:])