def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
    for container_scenario_config in scenarios_list:
        if len(container_scenario_config) > 1:
            pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
        else:
            pre_action_output = ""
        with open(container_scenario_config[0], "r") as f:
            cont_scenario_config = yaml.full_load(f)
            for cont_scenario in cont_scenario_config["scenarios"]:
                # capture start time
                start_time = int(time.time())
                killed_containers = container_killing_in_pod(cont_scenario)
                if len(container_scenario_config) > 1:
                    try:
                        failed_post_scenarios = post_actions.check_recovery(
                            kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output
                        )
                    except Exception as e:
                        logging.error("Failed to run post action checks: %s" % e)
                        sys.exit(1)
                else:
                    failed_post_scenarios = check_failed_containers(
                        killed_containers, cont_scenario.get("retry_wait", 120)
                    )
                logging.info("Waiting for the specified duration: %s" % (wait_duration))
                time.sleep(wait_duration)
                # capture end time
                end_time = int(time.time())
                # publish cerberus status
                cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
                logging.info("")
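# Illustrative scenario file consumed by container_run() above. Only the
# "scenarios" key and "retry_wait" appear in this module; the per-scenario
# fields passed to container_killing_in_pod() are not visible here, so the
# other keys and all values below are assumptions:
#
#   scenarios:
#     - name: "kill etcd container"        # hypothetical
#       namespace: "openshift-etcd"        # hypothetical
#       label_selector: "k8s-app=etcd"     # hypothetical
#       container_name: "etcd"             # hypothetical
#       retry_wait: 120                    # seconds check_failed_containers() waits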
def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
    try:
        # Loop to run the scenarios starts here
        for pod_scenario in scenarios_list:
            if len(pod_scenario) > 1:
                pre_action_output = post_actions.run(kubeconfig_path, pod_scenario[1])
            else:
                pre_action_output = ""

            scenario_logs = runcommand.invoke(
                "powerfulseal autonomous --use-pod-delete-instead-"
                "of-ssh-kill --policy-file %s --kubeconfig %s "
                "--no-cloud --inventory-kubernetes --headless"
                % (pod_scenario[0], kubeconfig_path)
            )

            # Display pod scenario logs/actions
            print(scenario_logs)

            logging.info("Scenario: %s has been successfully injected!" % (pod_scenario[0]))
            logging.info("Waiting for the specified duration: %s" % (wait_duration))
            time.sleep(wait_duration)

            failed_post_scenarios = post_actions.check_recovery(
                kubeconfig_path, pod_scenario, failed_post_scenarios, pre_action_output
            )
            cerberus.publish_kraken_status(config, failed_post_scenarios)
    except Exception as e:
        logging.error(
            "Failed to run scenario: %s. Encountered the following "
            "exception: %s" % (pod_scenario[0], e)
        )
    return failed_post_scenarios
def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
    # Loop to run the scenarios starts here
    for pod_scenario in scenarios_list:
        if len(pod_scenario) > 1:
            pre_action_output = post_actions.run(kubeconfig_path, pod_scenario[1])
        else:
            pre_action_output = ""
        try:
            # capture start time
            start_time = int(time.time())

            # load the scenario file and unserialize it against the pod plugin schema
            scenario_input = serialization.load_from_file(pod_scenario[0])
            s = pod_plugin.get_schema()
            input_data: pod_plugin.KillPodConfig = s.unserialize_input("pod", scenario_input)
            if kubeconfig_path is not None:
                input_data.kubeconfig_path = kubeconfig_path
            output_id, output_data = s.call_step("pod", input_data)
            if output_id == "error":
                data: pod_plugin.PodErrorOutput = output_data
                logging.error("Failed to run pod scenario: {}".format(data.error))
            else:
                data: pod_plugin.PodSuccessOutput = output_data
                for pod in data.pods:
                    print("Deleted pod {} in namespace {}\n".format(pod.pod_name, pod.pod_namespace))
        except Exception as e:
            logging.error(
                "Failed to run scenario: %s. Encountered the following "
                "exception: %s" % (pod_scenario[0], e)
            )
            sys.exit(1)

        logging.info("Scenario: %s has been successfully injected!" % (pod_scenario[0]))
        logging.info("Waiting for the specified duration: %s" % (wait_duration))
        time.sleep(wait_duration)
        try:
            failed_post_scenarios = post_actions.check_recovery(
                kubeconfig_path, pod_scenario, failed_post_scenarios, pre_action_output
            )
        except Exception as e:
            logging.error("Failed to run post action checks: %s" % e)
            sys.exit(1)
        # capture end time
        end_time = int(time.time())
        # publish cerberus status
        cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
    return failed_post_scenarios
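# Illustrative scenario file unserialized into pod_plugin.KillPodConfig above.
# The exact field names are defined by the pod plugin's schema and are not
# visible in this module; the keys and values below are placeholders:
#
#   name_pattern: "^etcd.*$"              # hypothetical
#   namespace_pattern: "^openshift-etcd$" # hypothetical
#   kill: 1                               # hypothetical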
def run(scenarios_list, config, wait_duration): failed_post_scenarios = "" for app_outage_config in scenarios_list: if len(app_outage_config) > 1: with open(app_outage_config, "r") as f: app_outage_config_yaml = yaml.full_load(f) scenario_config = app_outage_config_yaml["application_outage"] pod_selector = scenario_config.get("pod_selector", "{}") traffic_type = scenario_config.get("block", "[Ingress, Egress]") namespace = scenario_config.get("namespace", "") duration = scenario_config.get("duration", 60) start_time = int(time.time()) network_policy_template = """--- apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: name: kraken-deny spec: podSelector: matchLabels: {{ pod_selector }} policyTypes: {{ traffic_type }} """ t = Template(network_policy_template) rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type) # Write the rendered template to a file with open("kraken_network_policy.yaml", "w") as f: f.write(rendered_spec) # Block the traffic by creating network policy logging.info("Creating the network policy") runcommand.invoke( "kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)) # wait for the specified duration logging.info( "Waiting for the specified duration in the config: %s" % (duration)) time.sleep(duration) # unblock the traffic by deleting the network policy logging.info("Deleting the network policy") runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace)) logging.info( "End of scenario. Waiting for the specified duration: %s" % (wait_duration)) time.sleep(wait_duration) end_time = int(time.time()) cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
def run(scenarios_list, config, wait_duration, failed_post_scenarios, kubeconfig_path):
    for scenario_config in scenarios_list:
        if len(scenario_config) > 1:
            pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1])
        else:
            pre_action_output = ""
        with open(scenario_config[0], "r") as f:
            scenario_config_yaml = yaml.full_load(f)
            for scenario in scenario_config_yaml["scenarios"]:
                scenario_namespace = scenario.get("namespace", "^.*$")
                scenario_label = scenario.get("label_selector", None)
                run_count = scenario.get("runs", 1)
                namespace_action = scenario.get("action", "delete")
                run_sleep = scenario.get("sleep", 10)
                wait_time = scenario.get("wait_time", 30)
                killed_namespaces = []
                namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label)
                start_time = int(time.time())
                for i in range(run_count):
                    if len(namespaces) == 0:
                        logging.error(
                            "Couldn't %s %s namespaces, not enough namespaces matching %s with label %s"
                            % (namespace_action, str(run_count), scenario_namespace, str(scenario_label))
                        )
                        sys.exit(1)
                    selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)]
                    killed_namespaces.append(selected_namespace)
                    try:
                        runcommand.invoke("oc %s project %s" % (namespace_action, selected_namespace))
                        logging.info("%s on namespace %s was successful" % (namespace_action, str(selected_namespace)))
                    except Exception as e:
                        logging.info("%s on namespace %s was unsuccessful" % (namespace_action, str(selected_namespace)))
                        logging.info("Namespace action error: " + str(e))
                        sys.exit(1)
                    namespaces.remove(selected_namespace)
                    logging.info("Waiting %s seconds between namespace deletions" % str(run_sleep))
                    time.sleep(run_sleep)

                logging.info("Waiting for the specified duration: %s" % wait_duration)
                time.sleep(wait_duration)

                if len(scenario_config) > 1:
                    try:
                        failed_post_scenarios = post_actions.check_recovery(
                            kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output
                        )
                    except Exception as e:
                        logging.error("Failed to run post action checks: %s" % e)
                        sys.exit(1)
                else:
                    failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time)
                end_time = int(time.time())
                cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
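# Illustrative namespace scenario file for the run() above. The keys mirror
# the scenario.get() calls; the values are assumptions:
#
#   scenarios:
#     - namespace: "^openshift-.*$"   # regex matched against namespace names
#       label_selector: null          # alternative to namespace
#       action: delete                # passed to "oc <action> project <ns>"
#       runs: 1
#       sleep: 10                     # seconds between namespace actions
#       wait_time: 30                 # seconds check_active_namespace() waits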
def run(scenarios_list, config, wait_duration):
    for time_scenario_config in scenarios_list:
        with open(time_scenario_config, "r") as f:
            scenario_config = yaml.full_load(f)
            for time_scenario in scenario_config["time_scenarios"]:
                object_type, object_names = skew_time(time_scenario)
                not_reset = check_date_time(object_type, object_names)
                if len(not_reset) > 0:
                    logging.info("Object times were not reset")
                logging.info("Waiting for the specified duration: %s" % (wait_duration))
                time.sleep(wait_duration)
                cerberus.publish_kraken_status(config, not_reset)
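# Illustrative time scenario file for the run() above. Only the
# "time_scenarios" key is visible in this module; the per-scenario fields
# consumed by skew_time() are not shown here, so all keys and values below
# are assumptions:
#
#   time_scenarios:
#     - action: skew_time             # hypothetical
#       object_type: pod              # hypothetical
#       label_selector: "k8s-app=etcd"  # hypothetical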
def run(scenarios_list, config, wait_duration):
    failed_post_scenarios = []
    for shut_down_config in scenarios_list:
        if len(shut_down_config) > 1:
            pre_action_output = post_actions.run("", shut_down_config[1])
        else:
            pre_action_output = ""
        with open(shut_down_config[0], "r") as f:
            shut_down_config_yaml = yaml.full_load(f)
            shut_down_config_scenario = shut_down_config_yaml["cluster_shut_down_scenario"]
            cluster_shut_down(shut_down_config_scenario)
            logging.info("Waiting for the specified duration: %s" % (wait_duration))
            time.sleep(wait_duration)
            failed_post_scenarios = post_actions.check_recovery(
                "", shut_down_config, failed_post_scenarios, pre_action_output
            )
            cerberus.publish_kraken_status(config, failed_post_scenarios)
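# Illustrative shut-down scenario file for the run() above. Only the
# "cluster_shut_down_scenario" key is visible in this module; the fields
# consumed by cluster_shut_down() are not shown here, so the keys and values
# below are assumptions:
#
#   cluster_shut_down_scenario:
#     runs: 1                         # hypothetical
#     shut_down_duration: 120         # hypothetical
#     cloud_type: aws                 # hypothetical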
def run(scenarios_list, config, wait_duration): failed_post_scenarios = "" for zone_outage_config in scenarios_list: if len(zone_outage_config) > 1: with open(zone_outage_config, "r") as f: zone_outage_config_yaml = yaml.full_load(f) scenario_config = zone_outage_config_yaml["zone_outage"] vpc_id = scenario_config["vpc_id"] subnet_ids = scenario_config["subnet_id"] duration = scenario_config["duration"] cloud_type = scenario_config["cloud_type"] ids = {} acl_ids_created = [] if cloud_type.lower() == "aws": cloud_object = AWS() else: logging.error( "Cloud type " + cloud_type + " is not currently supported for zone outage scenarios" ) sys.exit(1) start_time = int(time.time()) for subnet_id in subnet_ids: logging.info("Targeting subnet_id") network_association_ids = [] associations, original_acl_id = cloud_object.describe_network_acls( vpc_id, subnet_id) for entry in associations: if entry["SubnetId"] == subnet_id: network_association_ids.append( entry["NetworkAclAssociationId"]) logging.info( "Network association ids associated with the subnet %s: %s" % (subnet_id, network_association_ids)) acl_id = cloud_object.create_default_network_acl(vpc_id) new_association_id = cloud_object.replace_network_acl_association( network_association_ids[0], acl_id) # capture the orginal_acl_id, created_acl_id and new association_id to use during the recovery ids[new_association_id] = original_acl_id acl_ids_created.append(acl_id) # wait for the specified duration logging.info( "Waiting for the specified duration in the config: %s" % (duration)) time.sleep(duration) # replace the applied acl with the previous acl in use for new_association_id, original_acl_id in ids.items(): cloud_object.replace_network_acl_association( new_association_id, original_acl_id) logging.info( "Wating for 60 seconds to make sure the changes are in place" ) time.sleep(60) # delete the network acl created for the run for acl_id in acl_ids_created: cloud_object.delete_network_acl(acl_id) logging.info( "End of scenario. Waiting for the specified duration: %s" % (wait_duration)) time.sleep(wait_duration) end_time = int(time.time()) cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
def run(scenarios_list, config):
    failed_post_scenarios = ""
    for app_config in scenarios_list:
        if len(app_config) > 1:
            with open(app_config, "r") as f:
                config_yaml = yaml.full_load(f)
                scenario_config = config_yaml["pvc_scenario"]
                pvc_name = scenario_config.get("pvc_name", "")
                pod_name = scenario_config.get("pod_name", "")
                namespace = scenario_config.get("namespace", "")
                target_fill_percentage = scenario_config.get("fill_percentage", "50")
                duration = scenario_config.get("duration", 60)

                logging.info(
                    "Input params:\n"
                    "pvc_name: '%s'\n"
                    "pod_name: '%s'\n"
                    "namespace: '%s'\n"
                    "target_fill_percentage: '%s%%'\n"
                    "duration: '%ss'"
                    % (str(pvc_name), str(pod_name), str(namespace), str(target_fill_percentage), str(duration))
                )

                # Check input params
                if not namespace:
                    logging.error("You must specify the namespace where the PVC is")
                    sys.exit(1)
                if not pvc_name and not pod_name:
                    logging.error("You must specify the pvc_name or the pod_name")
                    sys.exit(1)
                if pvc_name and pod_name:
                    logging.info(
                        "pod_name will be ignored; the pod used will be retrieved from the pod mounted in the pvc_name"
                    )

                # Get pod name
                if pvc_name:
                    if pod_name:
                        logging.info(
                            "pod_name '%s' will be overridden from the pod mounted in the PVC" % (str(pod_name))
                        )
                    command = "kubectl describe pvc %s -n %s | grep -E 'Mounted By:|Used By:' | grep -Eo '[^: ]*$'" % (
                        str(pvc_name),
                        str(namespace),
                    )
                    logging.debug("Get pod name command:\n %s" % command)
                    pod_name = runcommand.invoke(command, 60).rstrip()
                    logging.info("Pod name: %s" % pod_name)
                    if pod_name == "<none>":
                        logging.error(
                            "Pod associated with %s PVC, on namespace %s, not found"
                            % (str(pvc_name), str(namespace))
                        )
                        sys.exit(1)

                # Get volume name
                command = 'kubectl get pods %s -n %s -o json | jq -r ".spec.volumes"' % (
                    str(pod_name),
                    str(namespace),
                )
                logging.debug("Get volume name command:\n %s" % command)
                volumes_list = runcommand.invoke(command, 60).rstrip()
                volumes_list_json = json.loads(volumes_list)
                for entry in volumes_list_json:
                    # skip volumes that are not backed by a PVC
                    claim_name = entry.get("persistentVolumeClaim", {}).get("claimName", "")
                    if len(claim_name) > 0:
                        volume_name = entry["name"]
                        pvc_name = claim_name
                        break
                logging.info("Volume name: %s" % volume_name)
                logging.info("PVC name: %s" % pvc_name)

                # Get container name and mount path
                command = 'kubectl get pods %s -n %s -o json | jq -r ".spec.containers"' % (
                    str(pod_name),
                    str(namespace),
                )
                logging.debug("Get mount path command:\n %s" % command)
                volume_mounts_list = runcommand.invoke(command, 60).rstrip().replace("\n]\n[\n", ",\n")
                volume_mounts_list_json = json.loads(volume_mounts_list)
                for entry in volume_mounts_list_json:
                    for vol in entry["volumeMounts"]:
                        if vol["name"] == volume_name:
                            mount_path = vol["mountPath"]
                            container_name = entry["name"]
                            break
                logging.info("Container name: %s" % container_name)
                logging.info("Mount path: %s" % mount_path)

                # Get PVC capacity
                command = "kubectl describe pvc %s -n %s | grep \"Capacity:\" | grep -Eo '[^: ]*$'" % (
                    str(pvc_name),
                    str(namespace),
                )
                logging.debug("Get PVC capacity command:\n %s" % command)
                pvc_capacity = runcommand.invoke(command, 60).rstrip()
                pvc_capacity_kb = toKbytes(pvc_capacity)
                logging.info("PVC capacity: %s KB" % pvc_capacity_kb)

                # Get used bytes in PVC
                command = "df %s -B 1024 | sed 1d | awk -F' ' '{print $3}'" % (str(mount_path))
                logging.debug("Get used bytes in PVC command:\n %s" % command)
                pvc_used_kb = kubecli.exec_cmd_in_pod(command, pod_name, namespace, container_name, "sh")
                logging.info("PVC used: %s KB" % pvc_used_kb)

                # Check for a valid target fill percentage
                current_fill_percentage = float(pvc_used_kb) / float(pvc_capacity_kb)
                if not (current_fill_percentage * 100 < float(target_fill_percentage) <= 99):
                    logging.error(
                        "Target fill percentage (%.2f%%) is lower than the current fill "
                        "percentage (%.2f%%) or higher than 99%%"
                        % (float(target_fill_percentage), current_fill_percentage * 100)
                    )
                    sys.exit(1)

                # Calculate the size of the file needed to reach the target fill percentage
                file_size_kb = int(
                    (float(target_fill_percentage) / 100 * float(pvc_capacity_kb)) - float(pvc_used_kb)
                )
                logging.debug("File size: %s KB" % file_size_kb)

                file_name = "kraken.tmp"
                logging.info(
                    "Creating %s file, %s KB size, in pod %s at %s (ns %s)"
                    % (str(file_name), str(file_size_kb), str(pod_name), str(mount_path), str(namespace))
                )

                start_time = int(time.time())

                # Create temp file in the PVC
                full_path = "%s/%s" % (str(mount_path), str(file_name))
                command = "fallocate -l $((%s*1024)) %s" % (str(file_size_kb), str(full_path))
                logging.debug("Create temp file in the PVC command:\n %s" % command)
                kubecli.exec_cmd_in_pod(command, pod_name, namespace, container_name, "sh")

                # Check that the file was created
                command = "ls -lh %s" % (str(mount_path))
                logging.debug("Check file is created command:\n %s" % command)
                response = kubecli.exec_cmd_in_pod(command, pod_name, namespace, container_name, "sh")
                logging.info("\n" + str(response))
                if str(file_name).lower() in str(response).lower():
                    logging.info("%s file successfully created" % (str(full_path)))
                else:
                    logging.error("Failed to create tmp file with %s size" % (str(file_size_kb)))
                    remove_temp_file(file_name, full_path, pod_name, namespace, container_name, mount_path, file_size_kb)
                    sys.exit(1)

                # Wait for the specified duration
                logging.info("Waiting for the specified duration in the config: %ss" % (duration))
                time.sleep(duration)
                logging.info("Finish waiting")

                remove_temp_file(file_name, full_path, pod_name, namespace, container_name, mount_path, file_size_kb)

                end_time = int(time.time())
                cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
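# Illustrative pvc_scenario file for the run() above. The keys mirror the
# scenario_config.get() calls; the values are assumptions:
#
#   pvc_scenario:
#     pvc_name: my-pvc                # pvc_name takes precedence over pod_name
#     pod_name: ""
#     namespace: my-namespace
#     fill_percentage: 50             # must be above the current fill and <= 99
#     duration: 60                    # seconds the kraken.tmp file is kept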
def run(scenarios_list, config, wait_duration, failed_post_scenarios, kubeconfig_path):
    for scenario_config in scenarios_list:
        if len(scenario_config) > 1:
            pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1])
        else:
            pre_action_output = ""
        with open(scenario_config[0], "r") as f:
            scenario_config_yaml = yaml.full_load(f)
            for scenario in scenario_config_yaml["scenarios"]:
                scenario_namespace = scenario.get("namespace", "")
                scenario_label = scenario.get("label_selector", "")
                if scenario_namespace is not None and scenario_namespace.strip() != "":
                    if scenario_label is not None and scenario_label.strip() != "":
                        logging.error("You can only have namespace or label set in your namespace scenario")
                        logging.error(
                            "Current scenario config has namespace '%s' and label selector '%s'"
                            % (scenario_namespace, scenario_label)
                        )
                        logging.error(
                            "Please set either namespace to blank ('') or label_selector to blank ('') to continue"
                        )
                        sys.exit(1)
                delete_count = scenario.get("delete_count", 1)
                run_count = scenario.get("runs", 1)
                run_sleep = scenario.get("sleep", 10)
                wait_time = scenario.get("wait_time", 30)
                killed_namespaces = []
                start_time = int(time.time())
                for i in range(run_count):
                    namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label)
                    for j in range(delete_count):
                        if len(namespaces) == 0:
                            logging.error(
                                "Couldn't delete %s namespaces, not enough namespaces matching %s with label %s"
                                % (str(delete_count), scenario_namespace, str(scenario_label))
                            )
                            sys.exit(1)
                        selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)]
                        killed_namespaces.append(selected_namespace)
                        try:
                            kubecli.delete_namespace(selected_namespace)
                            logging.info("Delete on namespace %s was successful" % str(selected_namespace))
                        except Exception as e:
                            logging.info("Delete on namespace %s was unsuccessful" % str(selected_namespace))
                            logging.info("Namespace action error: " + str(e))
                            sys.exit(1)
                        namespaces.remove(selected_namespace)
                        logging.info("Waiting %s seconds between namespace deletions" % str(run_sleep))
                        time.sleep(run_sleep)

                    logging.info("Waiting for the specified duration: %s" % wait_duration)
                    time.sleep(wait_duration)

                    if len(scenario_config) > 1:
                        try:
                            failed_post_scenarios = post_actions.check_recovery(
                                kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output
                            )
                        except Exception as e:
                            logging.error("Failed to run post action checks: %s" % e)
                            sys.exit(1)
                    else:
                        failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time)
                end_time = int(time.time())
                cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
def run(scenarios_list, config, wait_duration): failed_post_scenarios = "" logging.info("Runing the Network Chaos tests") for net_config in scenarios_list: with open(net_config, "r") as file: param_lst = ["latency", "loss", "bandwidth"] test_config = yaml.safe_load(file) test_dict = test_config["network_chaos"] test_duration = int(test_dict.get("duration", 300)) test_interface = test_dict.get("interfaces", []) test_node = test_dict.get("node_name", "") test_node_label = test_dict.get("label_selector", "node-role.kubernetes.io/master") test_execution = test_dict.get("execution", "serial") test_instance_count = test_dict.get("instance_count", 1) test_egress = test_dict.get("egress", {"bandwidth": "100mbit"}) if test_node: node_name_list = test_node.split(",") else: node_name_list = [test_node] nodelst = [] for single_node_name in node_name_list: nodelst.extend( common_node_functions.get_node(single_node_name, test_node_label, test_instance_count)) file_loader = FileSystemLoader( os.path.abspath(os.path.dirname(__file__))) env = Environment(loader=file_loader) pod_template = env.get_template("pod.j2") test_interface = verify_interface(test_interface, nodelst, pod_template) joblst = [] egress_lst = [i for i in param_lst if i in test_egress] chaos_config = { "network_chaos": { "duration": test_duration, "interfaces": test_interface, "node_name": ",".join(nodelst), "execution": test_execution, "instance_count": test_instance_count, "egress": test_egress, } } logging.info("Executing network chaos with config \n %s" % yaml.dump(chaos_config)) job_template = env.get_template("job.j2") try: for i in egress_lst: for node in nodelst: exec_cmd = get_egress_cmd(test_execution, test_interface, i, test_dict["egress"], duration=test_duration) logging.info("Executing %s on node %s" % (exec_cmd, node)) job_body = yaml.safe_load( job_template.render(jobname=i + str(hash(node))[:5], nodename=node, cmd=exec_cmd)) joblst.append(job_body["metadata"]["name"]) api_response = kubecli.create_job(job_body) if api_response is None: raise Exception("Error creating job") if test_execution == "serial": logging.info("Waiting for serial job to finish") start_time = int(time.time()) wait_for_job(joblst[:], test_duration + 300) logging.info("Waiting for wait_duration %s" % wait_duration) time.sleep(wait_duration) end_time = int(time.time()) cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) if test_execution == "parallel": break if test_execution == "parallel": logging.info("Waiting for parallel job to finish") start_time = int(time.time()) wait_for_job(joblst[:], test_duration + 300) logging.info("Waiting for wait_duration %s" % wait_duration) time.sleep(wait_duration) end_time = int(time.time()) cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) except Exception as e: logging.error("Network Chaos exiting due to Exception %s" % e) sys.exit(1) finally: logging.info("Deleting jobs") delete_job(joblst[:])