def inject_node_scenario(action, node_scenario, node_scenario_object):
    """Inject a single node chaos scenario on a target node.

    Args:
        action: name of the scenario to run (e.g. "node_stop_scenario").
        node_scenario: dict of scenario settings read from the scenario config.
        node_scenario_object: cloud-specific scenario object exposing one
            method per supported action, each taking
            (instance_kill_count, node, timeout).
    """
    # Scenario configuration with defaults.
    instance_kill_count = node_scenario.get("instance_kill_count", 1)
    node_name = node_scenario.get("node_name", "")
    label_selector = node_scenario.get("label_selector", "")
    timeout = node_scenario.get("timeout", 120)

    # Resolve the node to apply the scenario to (by name or label selector).
    node = nodeaction.get_node(node_name, label_selector)

    # Every supported action maps to a method with the same
    # (instance_kill_count, node, timeout) signature, so use a dispatch
    # table instead of a long if/elif chain.
    scenario_methods = {
        "node_start_scenario": node_scenario_object.node_start_scenario,
        "node_stop_scenario": node_scenario_object.node_stop_scenario,
        "node_stop_start_scenario": node_scenario_object.node_stop_start_scenario,
        "node_termination_scenario": node_scenario_object.node_termination_scenario,
        "node_reboot_scenario": node_scenario_object.node_reboot_scenario,
        "stop_kubelet_scenario": node_scenario_object.stop_kubelet_scenario,
        "stop_start_kubelet_scenario": node_scenario_object.stop_start_kubelet_scenario,
        "node_crash_scenario": node_scenario_object.node_crash_scenario,
    }
    method = scenario_methods.get(action)
    if method is not None:
        method(instance_kill_count, node, timeout)
    else:
        # Previously an unknown action was silently ignored; log it so a
        # misspelled action in the config is visible to the operator.
        logging.info(
            "There is no node action that matches %s, skipping scenario" % action)
def inject_node_scenario(action, node_scenario, node_scenario_object):
    """Inject a single node chaos scenario on a target node.

    Generic ("general") cloud types only support the kubelet-stop and
    node-crash scenarios; any other action is skipped with a log message.
    The openstack-only "stop_start_helper_node_scenario" additionally needs
    'cloud_type', 'helper_node_ip', and optionally 'service' and
    'ssh_private_key' in the scenario config.

    Args:
        action: name of the scenario to run (e.g. "node_stop_scenario").
        node_scenario: dict of scenario settings read from the scenario config.
        node_scenario_object: cloud-specific scenario object exposing one
            method per supported action.
    """
    generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario")
    # Scenario configuration with defaults.
    instance_kill_count = node_scenario.get("instance_kill_count", 1)
    node_name = node_scenario.get("node_name", "")
    label_selector = node_scenario.get("label_selector", "")
    timeout = node_scenario.get("timeout", 120)
    service = node_scenario.get("service", "")
    ssh_private_key = node_scenario.get("ssh_private_key", "~/.ssh/id_rsa")

    # Resolve the node to apply the scenario to (by name or label selector).
    node = nodeaction.get_node(node_name, label_selector)

    if node_general and action not in generic_cloud_scenarios:
        logging.info("Scenario: " + action + " is not set up for generic cloud type, skipping action")
        return

    # Actions sharing the (instance_kill_count, node, timeout) signature are
    # dispatched through a table; the helper-node scenario has a different
    # signature and is handled explicitly below.
    scenario_methods = {
        "node_start_scenario": node_scenario_object.node_start_scenario,
        "node_stop_scenario": node_scenario_object.node_stop_scenario,
        "node_stop_start_scenario": node_scenario_object.node_stop_start_scenario,
        "node_termination_scenario": node_scenario_object.node_termination_scenario,
        "node_reboot_scenario": node_scenario_object.node_reboot_scenario,
        "stop_start_kubelet_scenario": node_scenario_object.stop_start_kubelet_scenario,
        "stop_kubelet_scenario": node_scenario_object.stop_kubelet_scenario,
        "node_crash_scenario": node_scenario_object.node_crash_scenario,
    }
    if action in scenario_methods:
        scenario_methods[action](instance_kill_count, node, timeout)
    elif action == "stop_start_helper_node_scenario":
        # Use .get() so a missing 'cloud_type' key produces the explicit
        # "not supported" error below instead of an unhandled KeyError.
        cloud_type = node_scenario.get("cloud_type", "")
        if cloud_type != "openstack":
            logging.error("Scenario: " + action + " is not supported for "
                          "cloud type " + cloud_type + ", skipping action")
            return
        helper_node_ip = node_scenario.get("helper_node_ip", "")
        if not helper_node_ip:
            logging.error("Helper node IP address is not provided")
            sys.exit(1)
        node_scenario_object.helper_node_stop_start_scenario(
            instance_kill_count, helper_node_ip, timeout)
        node_scenario_object.helper_node_service_status(
            helper_node_ip, service, ssh_private_key, timeout)
    else:
        logging.info(
            'There is no node action that matches %s, skipping scenario' % action)
def inject_node_scenario(action, node_scenario, node_scenario_object):
    """Inject a single node chaos scenario on a target node.

    Generic ("general") cloud types only support the kubelet-stop and
    node-crash scenarios; any other action is skipped with a log message.

    Args:
        action: name of the scenario to run (e.g. "node_stop_scenario").
        node_scenario: dict of scenario settings read from the scenario config.
        node_scenario_object: cloud-specific scenario object exposing one
            method per supported action, each taking
            (instance_kill_count, node, timeout).
    """
    generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario")
    # Scenario configuration with defaults.
    instance_kill_count = node_scenario.get("instance_kill_count", 1)
    node_name = node_scenario.get("node_name", "")
    label_selector = node_scenario.get("label_selector", "")
    timeout = node_scenario.get("timeout", 120)

    # Resolve the node to apply the scenario to (by name or label selector).
    node = nodeaction.get_node(node_name, label_selector)

    if node_general and action not in generic_cloud_scenarios:
        logging.info("Scenario: " + action + " is not set up for generic cloud type, skipping action")
        return

    # Every supported action maps to a method with the same
    # (instance_kill_count, node, timeout) signature, so use a dispatch
    # table instead of a long if/elif chain.
    scenario_methods = {
        "node_start_scenario": node_scenario_object.node_start_scenario,
        "node_stop_scenario": node_scenario_object.node_stop_scenario,
        "node_stop_start_scenario": node_scenario_object.node_stop_start_scenario,
        "node_termination_scenario": node_scenario_object.node_termination_scenario,
        "node_reboot_scenario": node_scenario_object.node_reboot_scenario,
        "stop_start_kubelet_scenario": node_scenario_object.stop_start_kubelet_scenario,
        "stop_kubelet_scenario": node_scenario_object.stop_kubelet_scenario,
        "node_crash_scenario": node_scenario_object.node_crash_scenario,
    }
    method = scenario_methods.get(action)
    if method is not None:
        method(instance_kill_count, node, timeout)
    else:
        # Previously an unknown action was silently ignored; log it so a
        # misspelled action in the config is visible to the operator.
        logging.info(
            "There is no node action that matches %s, skipping scenario" % action)
def run(scenarios_list, config, wait_duration):
    """Run the network chaos scenarios described by the given config files.

    For each scenario file this renders one Kubernetes job per
    (egress-parameter, node) pair that applies traffic shaping
    (latency / loss / bandwidth) on the selected nodes, waits for the jobs
    to finish, and publishes the run window to cerberus. In "parallel"
    execution the rendered command applies all egress parameters at once,
    so only a single pass over the egress list is needed.

    Args:
        scenarios_list: iterable of paths to network-chaos YAML config files.
        config: kraken config object, passed through to cerberus.
        wait_duration: seconds to sleep after each chaos injection.
    """
    failed_post_scenarios = ""
    logging.info("Running the Network Chaos tests")
    for net_config in scenarios_list:
        with open(net_config, "r") as file:
            param_lst = ["latency", "loss", "bandwidth"]
            test_config = yaml.safe_load(file)
            test_dict = test_config["network_chaos"]
            test_duration = int(test_dict.get("duration", 300))
            test_interface = test_dict.get("interfaces", [])
            test_node = test_dict.get("node_name", "")
            test_node_label = test_dict.get(
                "label_selector", "node-role.kubernetes.io/master")
            test_execution = test_dict.get("execution", "serial")
            test_instance_count = test_dict.get("instance_count", 1)
            test_egress = test_dict.get("egress", {"bandwidth": "100mbit"})

            # 'node_name' is a comma-separated list; an empty string falls
            # through so nodes are selected purely by label.
            if test_node:
                node_name_list = test_node.split(",")
            else:
                node_name_list = [test_node]
            nodelst = []
            for single_node_name in node_name_list:
                nodelst.extend(common_node_functions.get_node(
                    single_node_name, test_node_label, test_instance_count))

            file_loader = FileSystemLoader(
                os.path.abspath(os.path.dirname(__file__)))
            env = Environment(loader=file_loader)
            pod_template = env.get_template("pod.j2")
            test_interface = verify_interface(
                test_interface, nodelst, pod_template)
            joblst = []
            # Apply only the egress parameters actually present in the config.
            egress_lst = [i for i in param_lst if i in test_egress]
            chaos_config = {
                "network_chaos": {
                    "duration": test_duration,
                    "interfaces": test_interface,
                    "node_name": ",".join(nodelst),
                    "execution": test_execution,
                    "instance_count": test_instance_count,
                    "egress": test_egress,
                }
            }
            logging.info("Executing network chaos with config \n %s"
                         % yaml.dump(chaos_config))
            job_template = env.get_template("job.j2")
            try:
                for i in egress_lst:
                    for node in nodelst:
                        # Fix: pass test_egress (which carries the default)
                        # rather than test_dict["egress"], which raised a
                        # KeyError whenever the config omitted the "egress"
                        # key and the default was in use.
                        exec_cmd = get_egress_cmd(
                            test_execution, test_interface, i,
                            test_egress, duration=test_duration)
                        logging.info(
                            "Executing %s on node %s" % (exec_cmd, node))
                        # Job name embeds a short hash of the node name to
                        # keep per-node jobs distinct.
                        job_body = yaml.safe_load(job_template.render(
                            jobname=i + str(hash(node))[:5],
                            nodename=node,
                            cmd=exec_cmd))
                        joblst.append(job_body["metadata"]["name"])
                        api_response = kubecli.create_job(job_body)
                        if api_response is None:
                            raise Exception("Error creating job")
                    if test_execution == "serial":
                        logging.info("Waiting for serial job to finish")
                        start_time = int(time.time())
                        wait_for_job(joblst[:], test_duration + 300)
                        logging.info(
                            "Waiting for wait_duration %s" % wait_duration)
                        time.sleep(wait_duration)
                        end_time = int(time.time())
                        cerberus.publish_kraken_status(
                            config, failed_post_scenarios,
                            start_time, end_time)
                    if test_execution == "parallel":
                        # Parallel mode shapes all egress params in one
                        # command, so one pass over egress_lst suffices.
                        break
                if test_execution == "parallel":
                    logging.info("Waiting for parallel job to finish")
                    start_time = int(time.time())
                    wait_for_job(joblst[:], test_duration + 300)
                    logging.info(
                        "Waiting for wait_duration %s" % wait_duration)
                    time.sleep(wait_duration)
                    end_time = int(time.time())
                    cerberus.publish_kraken_status(
                        config, failed_post_scenarios, start_time, end_time)
            except Exception as e:
                logging.error(
                    "Network Chaos exiting due to Exception %s" % e)
                sys.exit(1)
            finally:
                # Always clean up the chaos jobs, even on failure.
                logging.info("Deleting jobs")
                delete_job(joblst[:])