def node_termination_scenario(self, instance_kill_count, node, timeout):
    """Terminate the instance behind ``node`` and wait for it to leave the cluster.

    The injection is repeated ``instance_kill_count`` times. Each round
    resolves the instance ID through ``self.aws``, terminates it, waits for
    the termination to complete and then polls ``kubecli.list_nodes()``
    until ``node`` is no longer listed.

    Args:
        instance_kill_count: Number of injection rounds to run.
        node: Node name, both resolved to an instance ID and looked up in
            the node list.
        timeout: Maximum number of polls of the node list; one second is
            slept between polls.

    Any exception raised during a round (including the "still listed"
    failure below) is logged and ends the process via ``sys.exit(1)``.
    """
    for _round in range(instance_kill_count):
        try:
            logging.info("Starting node_termination_scenario injection")
            instance_id = self.aws.get_instance_id(node)
            start_message = "Terminating the node %s with instance ID: %s " % (
                node,
                instance_id,
            )
            logging.info(start_message)
            self.aws.terminate_instances(instance_id)
            self.aws.wait_until_terminated(instance_id)

            # Poll the node list, sleeping a second between attempts, and
            # stop early as soon as the node has disappeared.
            for _poll in range(timeout):
                if node in kubecli.list_nodes():
                    time.sleep(1)
                else:
                    break

            # Final check after the polling budget is used up.
            if node in kubecli.list_nodes():
                raise Exception("Node could not be terminated")

            logging.info(
                "Node with instance ID: %s has been terminated" % (instance_id)
            )
            logging.info("node_termination_scenario has been successfuly injected!")
        except Exception as err:
            failure_message = (
                "Failed to terminate node instance. Encountered following exception:"
                " %s. Test Failed" % (err)
            )
            logging.error(failure_message)
            logging.error("node_termination_scenario injection failed!")
            sys.exit(1)
def skew_time(scenario):
    """Run a ``date --set`` command on the nodes or pods a scenario targets.

    Args:
        scenario: Mapping read with the keys ``action``, ``object_type`` and,
            optionally, ``object_name``, ``label_selector`` and ``namespace``.

    Returns:
        ``("node", node_names)`` when ``object_type`` contains ``"node"``,
        ``("pod", pod_names)`` when it contains ``"pod"`` (each pod entry is
        a ``[name, namespace]`` list when built here), otherwise ``None``.

    Exits the process with status 1 when pods are selected by name but the
    scenario has no ``namespace`` key.
    """
    # An unrecognised action leaves the command as the bare "date --set ".
    action = scenario['action']
    skew_value = ""
    if action == "skew_date":
        skew_value = "00-01-01"
    elif action == "skew_time":
        skew_value = "01:01:01"
    skew_command = "date --set " + skew_value

    object_type = scenario["object_type"]

    if "node" in object_type:
        node_names = []
        if scenario.get('object_name'):
            node_names = scenario['object_name']
        elif scenario.get('label_selector'):
            node_names = kubecli.list_nodes(scenario['label_selector'])

        for node in node_names:
            node_debug(node, skew_command)
            logging.info("Reset date/time on node " + str(node))
        return "node", node_names

    if "pod" not in object_type:
        # Neither nodes nor pods were requested; nothing to do.
        return None

    pod_names = []
    if scenario.get('object_name'):
        for name in scenario['object_name']:
            if "namespace" not in scenario:
                logging.error("Need to set namespace when using pod name")
                sys.exit(1)
            pod_names.append([name, scenario['namespace']])
    elif scenario.get('label_selector'):
        # NOTE(review): entries are presumably [name, namespace] pairs,
        # given how they are indexed below; confirm against kubecli.
        pod_names = kubecli.get_all_pods(scenario['label_selector'])
    elif scenario.get('namespace'):
        namespace = scenario['namespace']
        pod_names = kubecli.list_pods(namespace)
        # Rewrite each entry in place as a [name, namespace] pair.
        for index, pod_name in enumerate(pod_names):
            pod_names[index] = [pod_name, namespace]

    for pod in pod_names:
        if len(pod) > 1:
            target, target_namespace = pod[0], pod[1]
        else:
            target, target_namespace = pod, scenario['namespace']
        pod_exec(target, skew_command, target_namespace)
        logging.info("Reset date/time on pod " + str(pod[0]))
    return "pod", pod_names
def skew_time(scenario):
    """Run a ``date --set`` command on the nodes or pod containers of a scenario.

    Args:
        scenario: Mapping read with the keys ``action``, ``object_type`` and,
            optionally, ``object_name``, ``label_selector``, ``namespace``
            and ``container_name``.

    Returns:
        ``("node", node_names)`` when ``object_type`` contains ``"node"``,
        ``("pod", pod_names)`` when it contains ``"pod"`` (every pod entry
        has the selected container name appended), otherwise ``None``.

    Exits the process with status 1 when pods are selected by name without a
    ``namespace`` key, when no pods are found, or when ``pod_exec`` returns
    ``False`` for a pod.
    """
    # An unrecognised action leaves the command as the bare "date --set ".
    action = scenario["action"]
    skew_value = ""
    if action == "skew_date":
        skew_value = "00-01-01"
    elif action == "skew_time":
        skew_value = "01:01:01"
    skew_command = "date --set " + skew_value

    object_type = scenario["object_type"]

    if "node" in object_type:
        node_names = []
        if scenario.get("object_name"):
            node_names = scenario["object_name"]
        elif scenario.get("label_selector"):
            node_names = kubecli.list_nodes(scenario["label_selector"])

        for node in node_names:
            node_debug(node, skew_command)
            logging.info("Reset date/time on node " + str(node))
        return "node", node_names

    if "pod" not in object_type:
        # Neither nodes nor pods were requested; nothing to do.
        return None

    container_name = scenario.get("container_name", "")
    pod_names = []
    if scenario.get("object_name"):
        for name in scenario["object_name"]:
            if "namespace" not in scenario:
                logging.error("Need to set namespace when using pod name")
                sys.exit(1)
            pod_names.append([name, scenario["namespace"]])
    elif scenario.get("namespace"):
        namespace = scenario["namespace"]
        # Only the presence of the key is tested here, not its truthiness.
        if "label_selector" in scenario:
            logging.info(
                "Querying for the pods matching the %s label_selector in namespace %s"
                % (scenario["label_selector"], namespace)
            )
            pod_names = kubecli.list_pods(namespace, scenario["label_selector"])
        else:
            logging.info(
                "label_selector key not found, querying for all the pods in namespace: %s"
                % (namespace)
            )
            pod_names = kubecli.list_pods(namespace)
        # Rewrite each entry in place as a [name, namespace] pair.
        for index, pod_name in enumerate(pod_names):
            pod_names[index] = [pod_name, namespace]
    elif scenario.get("label_selector"):
        # NOTE(review): entries are presumably [name, namespace] pairs,
        # given how they are indexed below; confirm against kubecli.
        pod_names = kubecli.get_all_pods(scenario["label_selector"])

    if len(pod_names) == 0:
        logging.info("Cannot find pods matching the namespace/label_selector, please check")
        sys.exit(1)

    for index, pod in enumerate(pod_names):
        if len(pod) > 1:
            target, target_namespace = pod[0], pod[1]
        else:
            target, target_namespace = pod, scenario["namespace"]

        selected_container_name = get_container_name(target, target_namespace, container_name)
        exec_result = pod_exec(target, skew_command, target_namespace, selected_container_name)
        # Only an explicit False counts as failure.
        if exec_result is False:
            logging.error(
                "Couldn't reset time on container %s in pod %s in namespace %s"
                % (selected_container_name, target, target_namespace)
            )
            sys.exit(1)

        # Record which container was used on the returned entry.
        pod_names[index].append(selected_container_name)
        logging.info("Reset date/time on pod " + str(pod[0]))
    return "pod", pod_names
def cluster_shut_down(shut_down_config):
    """Stop every node's instance, keep the cluster down, then start it again.

    Args:
        shut_down_config: Mapping with the keys ``runs`` (number of
            shut-down/restart cycles), ``shut_down_duration`` (seconds slept
            while the cluster is down), ``cloud_type`` (``aws``, ``gcp``,
            ``openstack``, ``azure`` or ``az``, case-insensitive) and
            ``timeout`` (forwarded to the cloud object's wait calls).

    Exits the process with status 1 when ``cloud_type`` is not supported.

    NOTE(review): both wait loops retry until every instance reports
    success, so an instance that never reaches the target state keeps the
    loop running indefinitely.
    """
    runs = shut_down_config["runs"]
    shut_down_duration = shut_down_config["shut_down_duration"]
    cloud_type = shut_down_config["cloud_type"]
    timeout = shut_down_config["timeout"]

    provider = cloud_type.lower()
    if provider == "aws":
        cloud_object = AWS()
    elif provider == "gcp":
        cloud_object = GCP()
    elif provider == "openstack":
        cloud_object = OPENSTACKCLOUD()
    elif provider in ["azure", "az"]:
        cloud_object = Azure()
    else:
        logging.error("Cloud type " + cloud_type + " is not currently supported for cluster shut down")
        sys.exit(1)

    # Instance IDs are resolved once and reused for every run.
    node_id = [cloud_object.get_instance_id(node) for node in kubecli.list_nodes()]
    logging.info("node id list " + str(node_id))

    for _run in range(runs):
        logging.info("Starting cluster_shut_down scenario injection")
        awaiting_stop = set(node_id)
        multiprocess_nodes(cloud_object.stop_instances, node_id)
        while awaiting_stop:
            # Iterate over a snapshot so confirmed entries can be dropped.
            for node in awaiting_stop.copy():
                # Tuple IDs are passed with their two fields swapped
                # (presumably provider-specific; confirm against callers).
                if type(node) is tuple:
                    wait_args = (node[1], node[0], timeout)
                else:
                    wait_args = (node, timeout)
                node_status = cloud_object.wait_until_stopped(*wait_args)
                # Only want to remove node from stopping list when fully stopped/no error
                if node_status:
                    awaiting_stop.remove(node)

        logging.info(
            "Shutting down the cluster for the specified duration: %s" % (shut_down_duration)
        )
        time.sleep(shut_down_duration)

        logging.info("Restarting the nodes")
        awaiting_start = set(node_id)
        multiprocess_nodes(cloud_object.start_instances, node_id)
        logging.info("Wait for each node to be running again")
        while awaiting_start:
            for node in awaiting_start.copy():
                if type(node) is tuple:
                    wait_args = (node[1], node[0], timeout)
                else:
                    wait_args = (node, timeout)
                node_status = cloud_object.wait_until_running(*wait_args)
                if node_status:
                    awaiting_start.remove(node)

        logging.info("Waiting for 150s to allow cluster component initialization")
        time.sleep(150)
        logging.info("Successfully injected cluster_shut_down scenario!")