def detach_random_instances(asg_names: List[str] = None,
                            tags: List[dict] = None,
                            instance_count: int = None,
                            instance_percent: int = None,
                            decrement_capacity: bool = False,
                            configuration: Configuration = None,
                            secrets: Secrets = None) -> AWSResponse:
    """
    Detach one or more random 'InService' instances from matching
    autoscaling groups.

    Parameters:
        One of:
            asg_names: a list of one or more asg names
            tags: a list of key/value pairs to identify asg(s) by
        One of:
            instance_count: integer value of number of instances to detach
            instance_percent: 1-100, percent of instances to detach
        decrement_capacity: boolean value to determine if the desired
            capacity of the autoscaling group should be decreased

    `tags` are expected as a list of dictionary objects:
        [
            {'Key': 'TagKey1', 'Value': 'TagValue1'},
            {'Key': 'TagKey2', 'Value': 'TagValue2'},
            ...
        ]
    """
    validate_asgs(asg_names, tags)

    if not any([instance_count, instance_percent]):
        raise FailedActivity('You must specify either "instance_count" or '
                             '"instance_percent"')

    client = aws_client('autoscaling', configuration, secrets)
    asgs = get_asg_by_name(asg_names, client) if asg_names \
        else get_asg_by_tags(tags, client)

    results = {}
    for group in asgs['AutoScalingGroups']:
        group_name = group['AutoScalingGroupName']
        # only instances currently 'InService' are eligible for detachment
        in_service = [
            i['InstanceId'] for i in group['Instances']
            if i['LifecycleState'] == 'InService'
        ]

        # a percentage takes precedence and is recomputed per group
        count = instance_count
        if instance_percent:
            count = int(float(len(in_service) * float(instance_percent)) / 100)

        if count > len(in_service):
            raise FailedActivity('You are attempting to detach more '
                                 'instances than exist on asg %s' % group_name)

        picked = random.sample(in_service, count)
        response = client.detach_instances(
            AutoScalingGroupName=group_name,
            InstanceIds=sorted(picked),
            ShouldDecrementDesiredCapacity=decrement_capacity)
        results.setdefault('Activities', []).extend(response['Activities'])
    return results
def chaosansible_run(
    host_list: list = ("localhost",),
    configuration: Configuration = None,
    facts: bool = False,
    become: bool = False,
    run_once: bool = False,
    ansible: dict = None,
    num_target: str = "all",
    secrets: Secrets = None,
):
    """
    Run a task through ansible and eventually gather facts from host.

    :param host_list: hosts to target. Default is the local machine only.
    :param configuration: may carry `ansible_module_path`,
        `ansible_become_user`, `ansible_ssh_private_key`, `ansible_user`,
        `become_ask_pass` and `ansible_ssh_extra_args`
    :param facts: whether to gather facts from the targets
    :param become: run the task with privilege escalation (sudo)
    :param run_once: run the task on a single target only
    :param ansible: dict with `module` and `args` describing the task to run
    :param num_target: "all" or an integer (as string) of random targets
    :raises InvalidActivity: when `ansible` lacks `module` or `args`
    :raises FailedActivity: when the task failed or a target is unreachable
    :return: JSON-encoded mapping of host -> result for hosts that succeeded
    """
    # FIX: the previous default `("localhost")` was a plain *string* (the
    # parentheses do not make a tuple without a trailing comma), so joining
    # it below produced a bogus per-character inventory. A one-element tuple
    # is the correct, immutable default.
    # FIX: `ansible` previously defaulted to a mutable `{}`; use the None
    # sentinel idiom instead.
    ansible = ansible or {}

    # Check for correct inputs
    if ansible:
        if ansible.get("module") is None:
            raise InvalidActivity("No ansible module defined")
        if ansible.get("args") is None:
            raise InvalidActivity("No ansible module args defined")

    configuration = configuration or {}

    # Ansible configuration elements
    module_path = configuration.get("ansible_module_path")
    become_user = configuration.get("ansible_become_user")
    ssh_key_path = configuration.get("ansible_ssh_private_key")
    ansible_user = configuration.get("ansible_user")
    become_ask_pass = configuration.get("become_ask_pass")
    ssh_extra_args = configuration.get("ansible_ssh_extra_args")

    context.CLIARGS = ImmutableDict(
        connection="smart",
        verbosity=0,
        module_path=module_path,
        forks=10,
        become=become,
        become_method="sudo",
        become_user=become_user,
        check=False,
        diff=False,
        private_key_file=ssh_key_path,
        remote_user=ansible_user,
        ssh_extra_args=ssh_extra_args,
    )

    # Update host_list regarding the number of desired targets.
    # A fresh list is built because host_list is reused below.
    if num_target != "all":
        new_host_list = random_host(host_list, int(num_target))
    else:
        new_host_list = list(host_list)

    # Create an inventory. A single-host source needs a trailing comma so
    # ansible parses it as a host list rather than an inventory file path.
    sources = ",".join(new_host_list)
    if len(new_host_list) == 1:
        sources += ","

    loader = DataLoader()
    inventory = InventoryManager(loader=loader, sources=sources)

    # Instantiate callback for storing results
    results_callback = ResultsCollectorJSONCallback()

    variable_manager = VariableManager(loader=loader, inventory=inventory)

    if become_ask_pass:
        passwords = dict(become_pass=become_ask_pass)
    else:
        passwords = None

    # Ansible taskmanager
    tqm = TaskQueueManager(
        inventory=inventory,
        variable_manager=variable_manager,
        loader=loader,
        passwords=passwords,
        stdout_callback=results_callback,
        run_additional_callbacks=False,
    )

    # Base play: only gathers facts (when requested)
    play_source = dict(
        name="Ansible Play",
        hosts=new_host_list,
        gather_facts=facts,
        tasks=[
            dict(
                name="facts",
                action=dict(module="debug", args=dict(var="ansible_facts")),
            ),
        ],
    )

    # When a module was supplied, append the actual task to run; otherwise
    # the play only gathers facts.
    if ansible:
        module = ansible.get("module")
        args = ansible.get("args")
        play_source["tasks"].append(
            dict(
                name="task",
                run_once=run_once,
                action=dict(module=module, args=args),
                register="shell_out",
            )
        )

    # Create an ansible playbook
    play = Play().load(
        play_source, variable_manager=variable_manager, loader=loader)

    # Run it; always clean up the task queue manager and temp files
    try:
        tqm.run(play)
    finally:
        tqm.cleanup()
        if loader:
            loader.cleanup_all_tmp_files()

    # Remove ansible tmpdir
    shutil.rmtree(C.DEFAULT_LOCAL_TMP, True)

    if len(results_callback.host_failed) > 0:
        print("Ansible error(s): ")
        for error in results_callback.host_failed:
            print(results_callback.host_failed[error].__dict__)
        raise FailedActivity("Failed to run ansible task")
    elif len(results_callback.host_unreachable) > 0:
        print("Unreachable host(s): ")
        for error in results_callback.host_unreachable:
            print(error)
        raise FailedActivity("At least one target is down")
    else:
        results = {}
        for host, result in results_callback.host_ok.items():
            results[host] = result
        return json.dumps(results)
def network_latency(filter: str = None, duration: int = 60, delay: int = 200,
                    jitter: int = 50, timeout: int = 60,
                    configuration: Configuration = None,
                    secrets: Secrets = None):
    """
    Increase the response time of the targeted virtual machines.

    Parameters
    ----------
    filter : str, optional
        Filter the virtual machines. If omitted, all machines in the
        subscription are potential chaos candidates.
    duration : int, optional
        How long the latency lasts. Defaults to 60 seconds.
    delay : int
        Added delay in ms. Defaults to 200.
    jitter : int
        Variance of the delay in ms. Defaults to 50.
    timeout : int
        Additional wait time (in seconds) for the operation to complete.
        Azure round-trips take time, so values below 30s are not
        recommended. Defaults to 60 seconds.

    Examples
    --------
    Deep dive into the filter syntax:
    https://docs.microsoft.com/en-us/azure/kusto/query/

    >>> network_latency("where resourceGroup=='rg'", configuration=c,
                        secrets=s)
    Increase the latency of all machines from the group 'rg'

    >>> network_latency("where resourceGroup=='rg' and name='name'",
                        configuration=c, secrets=s)
    Increase the latency of the named machine from the group 'rg'

    >>> network_latency("where resourceGroup=='rg' | sample 2",
                        configuration=c, secrets=s)
    Increase the latency of two machines at random from the group 'rg'
    """
    logger.debug(
        "Start network_latency: configuration='{}', filter='{}'".format(
            configuration, filter))

    machines = __fetch_machines(filter, configuration, secrets)
    client = __compute_mgmt_client(secrets, configuration)

    for machine in machines:
        machine_name = machine['name']
        resource_group = machine['resourceGroup']
        os_type = __get_os_type(machine)

        # only Linux is supported by the latency script
        if os_type != OS_LINUX:
            raise FailedActivity("Cannot run network latency test on OS: %s"
                                 % os_type)
        command_id = 'RunShellScript'
        script_name = "network_latency.sh"

        script_path = os.path.join(
            os.path.dirname(__file__), "scripts", script_name)
        with open(script_path) as script_file:
            script_content = script_file.read()
        logger.debug("Script content: {}".format(script_content))

        parameters = {
            'command_id': command_id,
            'script': [script_content],
            'parameters': [
                {'name': "duration", 'value': duration},
                {'name': "delay", 'value': delay},
                {'name': "jitter", 'value': jitter},
            ]
        }

        logger.debug("Increasing the latency of machine: {}".format(
            machine_name))
        poller = client.virtual_machines.run_command(
            resource_group, machine_name, parameters)
        # block until the script finishes or the grace period elapses
        result = poller.result(duration + timeout)
        logger.debug("Execution result: {}".format(poller))
        if not result:
            raise FailedActivity(
                "network_latency operation did not finish on time. "
                "You may consider increasing timeout setting.")
        logger.debug(result.value[0].message)  # stdout/stderr
mocked_init_client.return_value) mocked_command_run.assert_called_with(scale_set['resourceGroup'], instance, parameters=ANY, client=mocked_client) @patch('pdchaosazure.vmss.actions.fetch_vmss', autospec=True) @patch('pdchaosazure.vmss.actions.fetch_instances', autospec=True) @patch('pdchaosazure.vmss.actions.client.init', autospec=True) @patch.object(pdchaosazure.common.compute.command, 'prepare_path', autospec=True) @patch.object(pdchaosazure.common.compute.command, 'run', side_effect=FailedActivity("Activity monkey has failed")) def test_unhappily_fill_disk(mocked_command_run, mocked_command_prepare_path, mocked_init_client, fetch_instances, fetch_vmss): # arrange mocks mocked_command_prepare_path.return_value = '/root/burn/hard' scale_set = vmss_provider.provide_scale_set() instance = vmss_provider.provide_instance() fetch_vmss.return_value = [scale_set] fetch_instances.return_value = [instance] configuration = config_provider.provide_default_config() secrets = secrets_provider.provide_secrets_via_service_principal() mocked_client = MockComputeManagementClient() mocked_init_client.return_value = mocked_client
def drain_nodes(name: str = None, label_selector: str = None,
                delete_pods_with_local_storage: bool = False,
                timeout: int = 120, secrets: Secrets = None) -> bool:
    """
    Drain nodes matching the given label or name, so that no pods are
    scheduled on them any longer and running pods are evicted.

    It does a similar job to `kubectl drain --ignore-daemonsets` or
    `kubectl drain --delete-local-data --ignore-daemonsets` if
    `delete_pods_with_local_storage` is set to `True`. There is no
    equivalent to the `kubectl drain --force` flag.

    You probably want to call `uncordon` from in your experiment's
    rollbacks.

    :param name: target a single node by its exact metadata name
    :param label_selector: target node(s) by label (used when `name` unset)
    :param delete_pods_with_local_storage: also evict pods backed by an
        emptyDir volume
    :param timeout: seconds to wait for the evicted pods to disappear
    :raises FailedActivity: no matching node, unmanaged pod found,
        eviction rejected, or pods still present after `timeout`
    :return: True on success
    """
    # first let's make the node unschedulable
    cordon_node(name=name, label_selector=label_selector, secrets=secrets)

    api = create_k8s_api_client(secrets)
    v1 = client.CoreV1Api(api)

    # resolve the target node(s): by exact name or by label selector
    if name:
        ret = v1.list_node(field_selector="metadata.name={}".format(name))
        logger.debug("Found {d} node named '{s}'".format(
            d=len(ret.items), s=name))
    else:
        ret = v1.list_node(label_selector=label_selector)
        logger.debug("Found {d} node(s) labelled '{s}'".format(
            d=len(ret.items), s=label_selector))

    nodes = ret.items
    if not nodes:
        raise FailedActivity(
            "failed to find a node that matches selector {}".format(
                label_selector))

    for node in nodes:
        node_name = node.metadata.name
        # NOTE(review): `include_uninitialized` was removed from recent
        # kubernetes Python clients — confirm the pinned client version.
        ret = v1.list_pod_for_all_namespaces(
            include_uninitialized=True,
            field_selector="spec.nodeName={}".format(node_name))
        logger.debug("Found {d} pods on node '{n}'".format(
            d=len(ret.items), n=node_name))

        if not ret.items:
            continue

        # following the drain command from kubectl as best as we can
        eviction_candidates = []
        for pod in ret.items:
            # NOTE(review): this shadows the `name` *parameter* with the pod
            # name; intended for the log messages below, but fragile.
            name = pod.metadata.name
            phase = pod.status.phase
            volumes = pod.spec.volumes
            annotations = pod.metadata.annotations

            # do not handle mirror pods
            if annotations and "kubernetes.io/config.mirror" in annotations:
                logger.debug("Not deleting mirror pod '{}' on "
                             "node '{}'".format(name, node_name))
                continue

            # pods with an emptyDir volume are only evicted when explicitly
            # allowed via delete_pods_with_local_storage
            if any(filter(lambda v: v.empty_dir is not None, volumes)):
                logger.debug("Pod '{}' on node '{}' has a volume made "
                             "of a local storage".format(name, node_name))

                if not delete_pods_with_local_storage:
                    logger.debug("Not evicting a pod with local storage")
                    continue

                logger.debug("Deleting anyway due to flag")
                eviction_candidates.append(pod)
                continue

            # terminated pods can always be evicted
            if phase in ["Succeeded", "Failed"]:
                eviction_candidates.append(pod)
                continue

            # evict controller-managed pods; skip DaemonSet pods; refuse to
            # drain when an unmanaged pod is found (for/else below)
            for owner in pod.metadata.owner_references:
                if owner.controller and owner.kind != "DaemonSet":
                    eviction_candidates.append(pod)
                    break
                elif owner.kind == "DaemonSet":
                    logger.debug(
                        "Pod '{}' on node '{}' is owned by a DaemonSet. Will "
                        "not evict it".format(name, node_name))
                    break
            else:
                raise FailedActivity(
                    "Pod '{}' on node '{}' is unmanaged, cannot drain this "
                    "node. Delete it manually first?".format(name, node_name))

        if not eviction_candidates:
            logger.debug("No pods to evict. Let's return.")
            return True

        logger.debug("Found {} pods to evict".format(
            len(eviction_candidates)))
        for pod in eviction_candidates:
            eviction = client.V1beta1Eviction()

            eviction.metadata = client.V1ObjectMeta()
            eviction.metadata.name = pod.metadata.name
            eviction.metadata.namespace = pod.metadata.namespace

            eviction.delete_options = client.V1DeleteOptions()
            try:
                v1.create_namespaced_pod_eviction(
                    pod.metadata.name, pod.metadata.namespace, body=eviction)
            except ApiException as x:
                raise FailedActivity("Failed to evict pod {}: {}".format(
                    pod.metadata.name, x.body))

        pods = eviction_candidates[:]
        started = time.time()
        # poll every 10s until all evicted pods are gone (deleted or
        # rescheduled elsewhere) or the timeout elapses
        while True:
            logger.debug("Waiting for {} pods to go".format(len(pods)))
            if time.time() - started > timeout:
                remaining_pods = "\n".join([p.metadata.name for p in pods])
                raise FailedActivity(
                    "Draining nodes did not completed within {}s. "
                    "Remaining pods are:\n{}".format(timeout, remaining_pods))

            pending_pods = pods[:]
            for pod in pods:
                try:
                    p = v1.read_namespaced_pod(pod.metadata.name,
                                               pod.metadata.namespace)
                    # rescheduled elsewhere?
                    if p.metadata.uid != pod.metadata.uid:
                        pending_pods.remove(pod)
                        continue
                    logger.debug("Pod '{}' still around in phase: {}".format(
                        p.metadata.name, p.status.phase))
                except ApiException as x:
                    if x.status == 404:
                        # gone...
                        pending_pods.remove(pod)
            pods = pending_pods[:]
            if not pods:
                logger.debug("Evicted all pods we could")
                break
            time.sleep(10)

    return True
def get_metric_data(
    namespace: str,
    metric_name: str,
    dimension_name: str = None,
    dimension_value: str = None,
    dimensions: List[Dict[str, str]] = None,
    statistic: str = None,
    duration: int = 300,
    period: int = 60,
    offset: int = 0,
    unit: str = None,
    configuration: Configuration = None,
    secrets: Secrets = None,
) -> float:
    """Gets metric data for a given metric in a given time period. This
    method allows for more data to be retrieved than get_metric_statistics

    :params
        namespace: The AWS metric namespace
        metric_name: The name of the metric to pull data for
        One of:
            dimension_name, dimension_value: Required to search for ONE
                dimension
            dimensions: Required to search for dimensions combinations,
                expected as a list of dictionary objects:
                [{'Name': 'Dim1', 'Value': 'Val1'}, ...]
        unit: The type of unit desired to be collected
        statistic: The type of data to return.
            One of: Average, Sum, Minimum, Maximum, SampleCount
        period: The window in which to pull datapoints for
        offset: The time (seconds) to offset the endtime (from now)
        duration: The time (seconds) to set the start time (from now)
    """
    start_time = datetime.utcnow() - timedelta(seconds=duration)
    end_time = datetime.utcnow() - timedelta(seconds=offset)

    if dimensions is None and dimension_name is None \
            and dimension_value is None:
        raise FailedActivity("You must supply argument for dimensions")

    # build the single metric query piece by piece
    metric = {"Namespace": namespace, "MetricName": metric_name}
    if dimensions:
        metric["Dimensions"] = dimensions
    elif dimension_name and dimension_value:
        metric["Dimensions"] = [{
            "Name": dimension_name,
            "Value": dimension_value
        }]

    # NOTE(review): `statistic` is forwarded untouched; CloudWatch rejects
    # a None "Stat" — callers are expected to always supply it.
    metric_stat = {"Metric": metric, "Period": period, "Stat": statistic}
    if unit:
        metric_stat["Unit"] = unit

    args = {
        "MetricDataQueries": [{
            "Id": "m1",
            "MetricStat": metric_stat,
            "Label": metric_name,
        }],
        "StartTime": start_time,
        "EndTime": end_time,
    }

    client = aws_client("cloudwatch", configuration, secrets)
    data = client.get_metric_data(**args)["MetricDataResults"]

    # collect all returned values grouped by label (a single query here,
    # so effectively one label)
    collected = {}
    for entry in data:
        collected.setdefault(entry["Label"], []).extend(entry["Values"])

    # reduce the datapoints to a single number per the requested statistic
    reducers = {"Sum": sum, "Minimum": min, "Maximum": max}
    result = 0
    for values in collected.values():
        if not values:
            continue
        result = reducers.get(statistic, mean)(values)
    return round(result, 2)
def get_metric_statistics(namespace: str, metric_name: str,
                          dimension_name: str, dimension_value: str,
                          duration: int = 60, offset: int = 0,
                          statistic: str = None,
                          extended_statistic: str = None, unit: str = None,
                          configuration: Configuration = None,
                          secrets: Secrets = None):
    """
    Get the value of a statistical calculation for a given metric.

    The period for which the calculation will be performed is specified by
    a duration and an offset from the current time. Both are specified in
    seconds.

    Example: A duration of 60 seconds and an offset of 30 seconds will
    yield a statistical value based on the time interval between 30 and
    90 seconds in the past.

    More information about input parameters are available in the
    documentation
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/cloudwatch.html#CloudWatch.Client.get_metric_statistics
    """  # noqa: E501
    client = aws_client("cloudwatch", configuration, secrets)
    if statistic is None and extended_statistic is None:
        raise FailedActivity(
            'You must supply argument for statistic or extended_statistic')

    # the queried window ends `offset` seconds ago and spans `duration`
    window_end = datetime.utcnow() - timedelta(seconds=offset)
    window_start = window_end - timedelta(seconds=duration)

    request_kwargs = dict(
        Namespace=namespace,
        MetricName=metric_name,
        Dimensions=[{'Name': dimension_name, 'Value': dimension_value}],
        StartTime=window_start,
        EndTime=window_end,
        Period=duration,
    )
    if statistic is not None:
        request_kwargs['Statistics'] = [statistic]
    if extended_statistic is not None:
        request_kwargs['ExtendedStatistics'] = [extended_statistic]
    if unit is not None:
        request_kwargs['Unit'] = unit

    logger.debug('Request arguments: {}'.format(request_kwargs))
    response = client.get_metric_statistics(**request_kwargs)

    datapoints = response['Datapoints']
    if len(datapoints) == 0:
        raise FailedActivity(
            "No datapoints found for metric {}.{}.{}.{}".format(
                namespace, metric_name, dimension_name, dimension_value))

    # period == duration, so at most one datapoint is expected
    datapoint = datapoints[0]
    logger.debug('Response: {}'.format(response))
    try:
        if statistic is not None:
            return datapoint[statistic]
        elif extended_statistic is not None:
            return datapoint['ExtendedStatistics'][extended_statistic]
    except Exception as x:
        raise FailedActivity("Unable to parse response '{}': '{}'".format(
            response, str(x)))
def load_credentials(secrets: Secrets = None): """ Load GCP credentials from the experiment secrets To authenticate, you need to create a service account manually and either pass the filename or the content of the file into the `secrets` object. So, in the experiment, use one of the followings: ```json { "gcp": { "service_account_file": "/path/to/file.json" } } ``` ```json { "gcp": { "service_account_info": { "type": "service_account", "project_id": "...", "private_key_id": "...", "private_key": "...", "client_email": "...", "client_id": "...", "auth_uri": "https://accounts.google.com/o/oauth2/auth", "token_uri": "https://accounts.google.com/o/oauth2/token", "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/...." } } } ``` You would likely want to read value from the environment or Vault if you use the second approach, and avoid storing sensitive data into the experiment itself. Make sure your service account has enough permissions for the activities you wish to conduct (though do not give it too wide permissions either). 
See: https://developers.google.com/api-client-library/python/auth/service-accounts Also: http://google-auth.readthedocs.io/en/latest/reference/google.oauth2.service_account.html """ # noqa: E501 secrets = secrets or {} service_account_file = secrets.get("service_account_file") service_account_info = secrets.get("service_account_info") credentials = None if service_account_file: service_account_file = os.path.expanduser(service_account_file) if not os.path.exists(service_account_file): raise FailedActivity("GCP account settings not found at {}".format( service_account_file)) logger.debug( "Using GCP credentials from file: {}".format(service_account_file)) credentials = Credentials.from_service_account_file( service_account_file) elif service_account_info and isinstance(service_account_info, dict): logger.debug("Using GCP credentials embedded into secrets") credentials = Credentials.from_service_account_info( service_account_info) else: raise FailedActivity( "missing GCP credentials settings in secrets of this activity") if credentials is not None and credentials.expired: logger.debug("GCP credentials need to be refreshed as they expired") credentials.refresh(httplib2.Http()) if not credentials: raise FailedActivity( "missing a service account to authenticate with the " "Google Cloud Platform") return credentials
def fill_disk(filter: str = None,
              duration: int = 120,
              timeout: int = 60,
              size: int = 1000,
              path: str = None,
              configuration: Configuration = None,
              secrets: Secrets = None):
    """
    Fill the disk with random data.

    Parameters
    ----------
    filter : str, optional
        Filter the virtual machines. If omitted, all machines in the
        subscription are potential chaos candidates.
    duration : int, optional
        Lifetime of the file created. Defaults to 120 seconds.
    timeout : int
        Additional wait time (in seconds) for the filling operation to
        complete. Azure round-trips take time, so values below 30s are
        not recommended. Defaults to 60 seconds.
    size : int
        Size of the file created on the disk. Defaults to 1GB.
    path : str, optional
        The absolute path to write the fill file into.
        Defaults: C:/burn for Windows clients, /root/burn for Linux
        clients.

    Examples
    --------
    Deep dive into the filter syntax:
    https://docs.microsoft.com/en-us/azure/kusto/query/

    >>> fill_disk("where resourceGroup=='rg'", configuration=c, secrets=s)
    Fill all machines from the group 'rg'

    >>> fill_disk("where resourceGroup=='rg' and name='name'",
                  configuration=c, secrets=s)
    Fill the machine from the group 'rg' having the name 'name'

    >>> fill_disk("where resourceGroup=='rg' | sample 2", configuration=c,
                  secrets=s)
    Fill two machines at random from the group 'rg'
    """
    logger.debug(
        "Start fill_disk: configuration='{}', filter='{}'".format(
            configuration, filter))

    machines = __fetch_machines(filter, configuration, secrets)
    client = __compute_mgmt_client(secrets, configuration)

    for machine in machines:
        machine_name = machine['name']
        resource_group = machine['resourceGroup']
        os_type = __get_os_type(machine)

        # pick the run-command document, script and default target path
        # matching the machine's operating system
        if os_type == OS_WINDOWS:
            command_id = 'RunPowerShellScript'
            script_name = "fill_disk.ps1"
            fill_path = "C:/burn" if path is None else path
        elif os_type == OS_LINUX:
            command_id = 'RunShellScript'
            script_name = "fill_disk.sh"
            fill_path = "/root/burn" if path is None else path
        else:
            raise FailedActivity(
                "Cannot run disk filling test on OS: %s" % os_type)

        script_path = os.path.join(
            os.path.dirname(__file__), "../scripts", script_name)
        with open(script_path) as script_file:
            script_content = script_file.read()
        logger.debug("Script content: {}".format(script_content))

        parameters = {
            'command_id': command_id,
            'script': [script_content],
            'parameters': [
                {'name': "duration", 'value': duration},
                {'name': "size", 'value': size},
                {'name': "path", 'value': fill_path},
            ]
        }

        logger.debug("Filling disk of machine: {}".format(machine_name))
        poller = client.virtual_machines.run_command(
            resource_group, machine_name, parameters)
        # block until the script finishes or the grace period elapses
        result = poller.result(duration + timeout)
        logger.debug("Execution result: {}".format(poller))
        if not result:
            raise FailedActivity(
                "fill_disk operation did not finish on time. "
                "You may consider increasing timeout setting.")
        logger.debug(result.value[0].message)  # stdout/stderr
def modify_instance_groups_shrink_policy(
        cluster_id: str,
        group_id: str,
        decommission_timeout: int = None,
        terminate_instances: List[str] = None,
        protect_instances: List[str] = None,
        termination_timeout: int = None,
        configuration: Configuration = None,
        secrets: Secrets = None,
) -> AWSResponse:
    """Modify an instance groups shrink operations

    :param cluster_id: The cluster id
    :param group_id: The instance group id
    :param decommission_timeout: Timeout for decommissioning an instance
    :param terminate_instances: Instance id list to terminate when shrinking
    :param protect_instances: Instance id list to protect when shrinking
    :param termination_timeout: Override for list of instances to terminate
    :param configuration: access values used by actions/probes
    :param secrets: values that need to be passed on to actions/probes
    :return: Dict[str, Any]
    """
    if not any([decommission_timeout, terminate_instances,
                protect_instances]):
        raise FailedActivity("Must provide at least one of ["
                             '"decommission_timeout", "terminate_instances",'
                             '"protect_instances"]')

    if termination_timeout and not terminate_instances:
        raise FailedActivity('Must provide "terminate_instances" when '
                             'specifying "termination_timeout"')

    # assemble the optional resize policy from whichever arguments were set
    resize_policy = {}
    if terminate_instances:
        resize_policy["InstancesToTerminate"] = terminate_instances
    if protect_instances:
        resize_policy["InstancesToProtect"] = protect_instances
    if termination_timeout:
        resize_policy["InstanceTerminationTimeout"] = termination_timeout

    shrink_policy = {}
    if decommission_timeout:
        shrink_policy["DecommissionTimeout"] = decommission_timeout
    if resize_policy:
        shrink_policy["InstanceResizePolicy"] = resize_policy

    params = {
        "ClusterId": cluster_id,
        "InstanceGroups": [{
            "InstanceGroupId": group_id,
            "ShrinkPolicy": shrink_policy,
        }],
    }

    client = aws_client("emr", configuration, secrets)
    try:
        client.modify_instance_groups(**params)
        # return the refreshed group so callers see the applied policy
        return get_instance_group(client, cluster_id, group_id)
    except ClientError as e:
        logger.exception(e.response["Error"]["Message"])
        raise FailedActivity(e.response["Error"]["Message"])
def __linux_from_default(instance_id: str = None,
                         action: str = None,
                         parameters: Dict[str, Any] = None,
                         configuration: Configuration = None,
                         secrets: Secrets = None) -> AWSResponse:
    """
    Run a chaos script on an EC2 instance via SSM Run Command and poll
    until the command terminates, returning its stdout.

    :param instance_id: the target EC2 instance (required)
    :param action: name of the chaos action used to build the script
    :param parameters: must carry `duration`, used as the base timeout
    :raises FailedActivity: missing instance_id, timeout exceeded, or any
        underlying error (re-raised uniformly)
    """
    # `duration` drives both the script lifetime and the polling budget
    default_timeout = int(parameters['duration'])
    client = aws_client("ssm", configuration, secrets)
    if not instance_id:
        raise FailedActivity("you must specify the instance_id")

    try:
        if describe_os_type(instance_id, configuration, secrets) == "windows":
            os_type = OS_WINDOWS
        else:
            os_type = OS_LINUX

        # NOTE(review): os_type is fed to construct_script_content but the
        # SSM document is hard-coded to the Linux shell runner — see TODO.
        res_send_command = client.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            # ===============================================
            # TODO if in Windows
            # DocumentName == 'AWS-RunPowerShellScript'
            # ===============================================
            Parameters={
                'commands': [
                    construct_script_content(action, os_type, parameters)
                ]
            },
        )
        cmd_id = res_send_command["Command"]["CommandId"]
        logger.info("ssm run command is sent, id {}".format(cmd_id))

        # Poll the invocation with a decaying interval: start at half the
        # timeout and halve down to a 1s floor; give up once the total wait
        # exceeds the timeout plus a network-grace constant.
        totalwait = 0
        interval = default_timeout / 2
        while True:
            res_list = client.list_command_invocations(
                CommandId=cmd_id, Details=True)
            try:
                # the invocation list may be empty right after send_command
                cp = res_list['CommandInvocations'][0]['CommandPlugins'][0]
                status = cp['Status']
                if status == "InProgress":
                    time.sleep(interval)
                    totalwait += interval
                    interval = interval / 2 if interval > 1 else 1
                    if totalwait > default_timeout + SSMDEFAULTNETWORKLAGACY:
                        raise FailedActivity(
                            "Script exceeded default timeout {}".format(
                                default_timeout))
                    continue
                elif status == "Failed":
                    break
                elif status == "Success":
                    break
                else:
                    break
            except IndexError:
                # invocation not registered yet; retry shortly
                time.sleep(1)
                continue

        # extract and return the shell-script plugin output
        for command_invocation in res_list['CommandInvocations']:
            for invocation in command_invocation['CommandPlugins']:
                if invocation['Name'] == 'aws:runShellScript':
                    logger.info("ssm run command status {}".format(
                        invocation['Status']))
                    logger.info("ssm rum command result \n{}".format(
                        invocation['Output'].rstrip('\n')))
                    return invocation['Output'].rstrip('\n')
    except Exception as x:
        raise FailedActivity(
            "failed issuing a execute of shell script:\n{}".format(x))
def describe_emr_cluster(client: boto3.client, cluster_id: str) -> AWSResponse:
    """Return the EMR description of the cluster `cluster_id`.

    :param client: an initialized boto3 EMR client
    :param cluster_id: the cluster to describe
    :raises FailedActivity: wrapping any AWS client error
    """
    try:
        response = client.describe_cluster(ClusterId=cluster_id)
    except ClientError as e:
        # surface the AWS error message to the experiment log and caller
        logger.exception(e.response["Error"]["Message"])
        raise FailedActivity(e.response["Error"]["Message"])
    return response
def desired_equals_healthy_tags(tags: List[Dict[str, str]],
                                configuration: Configuration = None,
                                secrets: Secrets = None) -> AWSResponse:
    """
    If desired number matches the number of healthy instances

    for each of the auto-scaling groups matching tags provided

    `tags` are expected as:
    [{
        'Key': 'KeyName',
        'Value': 'KeyValue'
    },
    ...
    ]

    Returns: bool
    """
    if not tags:
        raise FailedActivity("Non-empty tags is required")

    client = aws_client('autoscaling', configuration, secrets)

    # AWS offers no server-side tag filter for auto-scaling groups, so
    # fetch every group page by page and filter locally.
    paginator = client.get_paginator('describe_auto_scaling_groups')
    all_groups = []
    for page in paginator.paginate(PaginationConfig={'PageSize': 100}):
        all_groups.extend(page['AutoScalingGroups'])

    # a group matches when its tag set contains every requested tag
    wanted = {"=".join([t['Key'], t['Value']]) for t in tags}
    filtered_groups = [
        g['AutoScalingGroupName'] for g in all_groups
        if wanted.issubset(
            {"=".join([t['Key'], t['Value']]) for t in g['Tags']})
    ]
    logger.debug("filtered groups: {}".format(filtered_groups))

    if not filtered_groups:
        raise FailedActivity(
            "No auto-scaling groups matched the tags provided")

    groups_descr = client.describe_auto_scaling_groups(
        AutoScalingGroupNames=filtered_groups)
    return is_desired_equals_healthy(groups_descr)
def run_experiment(experiment: Experiment,
                   settings: Settings = None) -> Journal:
    """
    Run the given `experiment` method step by step, in the following
    sequence: steady probe, action, close probe.

    Activities can be executed in background when they have the
    `"background"` property set to `true`. In that case, the activity is
    run in a thread. By the end of runs, those threads block until they
    are all complete.

    If the experiment has the `"dry"` property set to `False`, the
    experiment runs without actually executing the activities.

    NOTE: Tricky to make a decision whether we should rollback when
    exiting abnormally (Ctrl-C, SIGTERM...). Afterall, there is a chance
    we actually cannot afford to rollback properly. Better bailing to a
    conservative approach. This means we swallow :exc:`KeyboardInterrupt`
    and :exc:`SystemExit` and do not bubble it back up to the caller. We
    when were interrupted, we set the `interrupted` flag of the result
    accordingly to notify the caller this was indeed not terminated
    properly.
    """
    logger.info("Running experiment: {t}".format(t=experiment["title"]))

    dry = experiment.get("dry", False)
    if dry:
        logger.warning("Dry mode enabled")

    started_at = time.time()
    config = load_configuration(experiment.get("configuration", {}))
    secrets = load_secrets(experiment.get("secrets", {}), config)
    activity_pool, rollback_pool = get_background_pools(experiment)

    journal = initialize_run_journal(experiment)

    try:
        # this may fail the entire experiment right there if any of the
        # probes fail or fall out of their tolerance zone.
        # FIX: `state` must exist before the inner try — if the hypothesis
        # call itself raises FailedActivity before assigning it, the
        # handler below used to hit a NameError which escaped the outer
        # try (it only catches exit signals) and skipped journal
        # finalization entirely.
        state = None
        try:
            state = run_steady_state_hypothesis(
                experiment, config, secrets, dry)
            journal["steady_states"]["before"] = state
            if state is not None and not state["steady_state_met"]:
                p = state["probes"][-1]
                raise FailedActivity(
                    "Steady state probe '{p}' is not in the given tolerance "
                    "so failing this experiment".format(
                        p=p["activity"]["name"]))
        except FailedActivity as a:
            journal["steady_states"]["before"] = state
            journal["status"] = "failed"
            logger.fatal(str(a))
        else:
            try:
                journal["run"] = apply_activities(
                    experiment, config, secrets, activity_pool, dry)
            except Exception:
                journal["status"] = "aborted"
                # FIX: "un expected" -> "unexpected" in the log message
                logger.fatal(
                    "Experiment ran into an unexpected fatal error, "
                    "aborting now.", exc_info=True)
            else:
                try:
                    state = run_steady_state_hypothesis(
                        experiment, config, secrets, dry)
                    journal["steady_states"]["after"] = state
                    if state is not None and not state["steady_state_met"]:
                        p = state["probes"][-1]
                        raise FailedActivity(
                            "Steady state probe '{p}' is not in the given "
                            "tolerance so failing this experiment".format(
                                p=p["activity"]["name"]))
                except FailedActivity as a:
                    journal["status"] = "failed"
                    logger.fatal(str(a))
    except (KeyboardInterrupt, SystemExit):
        # deliberately swallowed: see the NOTE in the docstring
        journal["status"] = "interrupted"
        logger.warn("Received an exit signal, "
                    "leaving without applying rollbacks.")
    else:
        journal["status"] = journal["status"] or "completed"
        journal["rollbacks"] = apply_rollbacks(
            experiment, config, secrets, rollback_pool, dry)

    journal["end"] = datetime.utcnow().isoformat()
    journal["duration"] = time.time() - started_at
    logger.info("Experiment ended with status: {s}".format(
        s=journal["status"]))

    return journal
def terminate_random_instances(asg_names: List[str] = None,
                               tags: List[Dict[str, str]] = None,
                               instance_count: int = None,
                               instance_percent: int = None,
                               az: str = None,
                               configuration: Configuration = None,
                               secrets: Secrets = None) -> List[AWSResponse]:
    """
    Terminates one or more random healthy instances associated to an ALB

    A healthy instance is considered one with a status of 'InService'

    Parameters:
        One Of:
            - asg_names: a list of one or more asg names to target
            - tags: a list of key/value pairs to identify the asgs by

        One Of:
            - instance_count: the number of instances to terminate
            - instance_percent: the percentage of instances to terminate
            - az: the availability zone to terminate instances (every
              'InService' instance in that zone is terminated)

    `tags` are expected as a list of dictionary objects:
    [
        {'Key': 'TagKey1', 'Value': 'TagValue1'},
        {'Key': 'TagKey2', 'Value': 'TagValue2'},
        ...
    ]

    Raises FailedActivity when the selectors are invalid, when no instance
    matches, or when the EC2 termination call is rejected.
    """
    validate_asgs(asg_names, tags)

    # NOTE(review): this check only rejects "none" and "all three" of the
    # selectors; passing exactly two still slips through (instance_percent
    # then overrides instance_count below).  Kept as-is so existing callers
    # are not broken.
    if not any([instance_count, instance_percent, az]) or all(
            [instance_percent, instance_count, az]):
        raise FailedActivity(
            'Must specify one of "instance_count", "instance_percent", "az"')

    asg_client = aws_client('autoscaling', configuration, secrets)
    # Fix: create the EC2 client once, up front, instead of re-creating it
    # (and shadowing the autoscaling client) on every loop iteration.
    ec2_client = aws_client('ec2', configuration, secrets)

    if asg_names:
        asgs = get_asg_by_name(asg_names, asg_client)
    else:
        asgs = get_asg_by_tags(tags, asg_client)

    results = []
    for a in asgs['AutoScalingGroups']:
        # Filter out all instances not currently 'InService'
        instances = [
            e for e in a['Instances'] if e['LifecycleState'] == 'InService']

        if az:
            # AZ mode: terminate every healthy instance in that zone,
            # no random sampling.
            instances = [e for e in instances if e['AvailabilityZone'] == az]

            if not instances:
                raise FailedActivity(
                    'No instances found in Availability Zone: {}'.format(az))
        else:
            if instance_percent:
                instance_count = int(float(
                    len(instances) * float(instance_percent)) / 100)

            if len(instances) < instance_count:
                raise FailedActivity(
                    'Not enough healthy instances in {} to satisfy '
                    'termination count {} ({})'.format(
                        a['AutoScalingGroupName'],
                        instance_count, len(instances)))

            instances = random.sample(instances, instance_count)

        try:
            response = ec2_client.terminate_instances(
                InstanceIds=sorted([e['InstanceId'] for e in instances]))
            results.append({
                'AutoScalingGroupName': a['AutoScalingGroupName'],
                'TerminatingInstances': response['TerminatingInstances']})
        except ClientError as e:
            # Surface the AWS error as an activity failure for the journal.
            raise FailedActivity(e.response['Error']['Message'])
    return results
def sf_auth(configuration: Configuration, secrets: Secrets) -> ServiceFabricAuth:
    """
    Attempt to load the Service Fabric authentication information from a
    local configuration file or the passed `configuration` mapping. The
    latter takes precedence over the local configuration file.

    If you provide a configuration and secrets dictionary, the returned
    mapping will be created from their content. For instance, you could have:

    Configuration mapping (in your experiment file):
    ```python
    {
        "endpoint": "https://XYZ.westus.cloudapp.azure.com:19080",
        "verify_tls": False,
        "use_ca": False
    }
    ```

    Secrets mapping (in your experiment file):
    ```python
    {
        "azure": {
            "security": "pem",
            "pem_content": {
                "type": "env",
                "key": "AZURE_SERVICE_FABRIC_PEM"
            }
        }
    }
    ```

    In that case, the PEM content will be read from the local environment
    variable `AZURE_SERVICE_FABRIC_PEM` that you will have populated before
    hand. The content will be saved by the extension into a temporary file
    before being used to authenticate.

    You could also simply have that file ready instead:

    Secrets mapping (in your experiment file):
    ```python
    {
        "azure": {
            "security": "pem",
            "pem_path": "./party-cluster-XYZ-client-cert.pem"
        }
    }
    ```

    If you want to load the information from a local Service Fabric config
    file, set the `config_path` key in the `configuration` mapping.

    Configuration mapping (in your experiment file):
    ```python
    {
        "config_path": "~/.sfctl/config"
    }
    ```

    The path will be expanded. The authentication file should look like this:

    ```ini
    [servicefabric]
    endpoint = https://XYZ.westus.cloudapp.azure.com:19080
    no_verify = true
    use_ca = false
    security = pem
    pem_path = ./party-cluster-XYZ-client-cert.pem
    ```

    No matter the input, the yielded dictionary looks like this:
    ```python
    {
        "endpoint": "https://XYZ.westus.cloudapp.azure.com:19080",
        "verify": False,
        "security": {
            "type": "pem",
            "path": "./party-cluster-XYZ-client-cert.pem"
        }
    }
    ```

    Using this function goes as follows:

    ```python
    with auth(configuration, secrets) as info:
        url = "{}{}".format(
            info["endpoint"], "/Tools/Chaos/$/Start?api-version=6.0")
        r = requests.get(
            url, cert=info["security"]["path"], verify=info["verify"])
    ```
    """
    c = configuration or {}
    s = secrets or {}

    config_path = c.get("config_path")
    endpoint = c.get("endpoint", s.get("endpoint"))

    if config_path:
        config_path = os.path.expanduser(config_path)
        if not os.path.exists(config_path):
            raise FailedActivity(
                "Service Fabric configuration file not found at {}".format(
                    config_path))

        with open(config_path) as f:
            parser = configparser.ConfigParser()
            parser.read_file(f)

            pem_path = parser.get("servicefabric", "pem_path")
            if not pem_path:
                raise FailedActivity("cannot find {}".format(pem_path))

            yield {
                "endpoint": parser.get("servicefabric", "endpoint"),
                # Bugfix: `no_verify = true` means TLS verification must be
                # OFF; the previous double negation inverted the flag.
                "verify": parser.get("servicefabric", "no_verify") != "true",
                "security": {
                    "type": parser.get("servicefabric", "security"),
                    "path": pem_path
                }
            }
    elif endpoint:
        verify_tls = c.get("verify_tls", s.get("verify_tls", True))
        use_ca = c.get("use_ca", s.get("use_ca", True))
        security_kind = s.get("security", c.get("security", "pem"))
        pem_path = s.get("pem_path", c.get("pem_path", None))
        pem_content = s.get("pem_content", c.get("pem_content", None))

        info = {
            "endpoint": endpoint,
            "verify": verify_tls,
            "security": {
                "type": security_kind,
                "path": pem_path
            }
        }

        if not pem_path or (not os.path.exists(pem_path) and pem_content):
            # The temporary file is deleted when the consumer leaves the
            # context block, so yield while it still exists.
            with tempfile.NamedTemporaryFile(
                    mode="w+", encoding='utf-8') as pem_file:
                pem_file.write(pem_content)
                pem_file.seek(0)
                # Bugfix: keep the documented "path" key (was mistakenly
                # stored under "pem_path", leaving "path" set to None).
                info["security"]["path"] = pem_file.name
                yield info
        else:
            yield info
    else:
        raise FailedActivity(
            "Service Fabric client needs to know how to authenticate")
def __get_os_type(machine): os_type = machine['osType'] if os_type not in (OS_LINUX, OS_WINDOWS): raise FailedActivity("Unknown OS Type: %s" % os_type) return os_type
def __check_secrets(client_id, client_secret, tenant_id): if not client_id or not client_secret or not tenant_id: raise FailedActivity("Client could not find Azure credentials")
def get_metric_statistics(
    namespace: str,
    metric_name: str,
    dimension_name: str = None,
    dimension_value: str = None,
    dimensions: List[Dict[str, str]] = None,
    duration: int = 60,
    offset: int = 0,
    statistic: str = None,
    extended_statistic: str = None,
    unit: str = None,
    configuration: Configuration = None,
    secrets: Secrets = None,
):
    """
    Get the value of a statistical calculation for a given metric.

    The period for which the calculation will be performed is specified by
    a duration and an offset from the current time. Both are specified in
    seconds.

    Example: A duration of 60 seconds and an offset of 30 seconds will
    yield a statistical value based on the time interval between 30 and
    90 seconds in the past.

    Is required one of:
        dimension_name, dimension_value: Required to search for ONE dimension
        dimensions: Required to search for dimensions combinations
            Are expected as a list of dictionary objects:
            [{'Name': 'Dim1', 'Value': 'Val1'},
             {'Name': 'Dim2', 'Value': 'Val2'}, ...]

    More information about input parameters are available in the documentation
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/cloudwatch.html#CloudWatch.Client.get_metric_statistics
    """  # noqa: E501
    client = aws_client("cloudwatch", configuration, secrets)

    # Validate that a statistic selector and a dimension selector were given.
    if statistic is None and extended_statistic is None:
        raise FailedActivity(
            "You must supply argument for statistic or extended_statistic")
    if dimensions is None and dimension_name is None and dimension_value is None:
        raise FailedActivity("You must supply argument for dimensions")

    # Window ends `offset` seconds ago and spans `duration` seconds.
    end_time = datetime.utcnow() - timedelta(seconds=offset)
    start_time = end_time - timedelta(seconds=duration)

    # A full dimensions list wins over the single name/value pair.
    if dimensions is not None:
        dims = dimensions
    else:
        dims = [{"Name": dimension_name, "Value": dimension_value}]

    request_kwargs = {
        "Namespace": namespace,
        "MetricName": metric_name,
        "StartTime": start_time,
        "EndTime": end_time,
        "Period": duration,
        "Dimensions": dims,
    }
    if statistic is not None:
        request_kwargs["Statistics"] = [statistic]
    if extended_statistic is not None:
        request_kwargs["ExtendedStatistics"] = [extended_statistic]
    if unit is not None:
        request_kwargs["Unit"] = unit

    logger.debug(f"Request arguments: {request_kwargs}")

    response = client.get_metric_statistics(**request_kwargs)

    datapoints = response["Datapoints"]
    if not datapoints:
        # No datapoint in the window yields a neutral value.
        return 0
    datapoint = datapoints[0]
    logger.debug(f"Response: {response}")

    try:
        if statistic is not None:
            return datapoint[statistic]
        # The earlier validation guarantees extended_statistic is set here.
        return datapoint["ExtendedStatistics"][extended_statistic]
    except Exception as x:
        raise FailedActivity(
            f"Unable to parse response '{response}': '{str(x)}'")
@patch('chaoscf.api.get_app_by_name', autospec=True, return_value=responses.app) @patch('chaoscf.api.auth', autospec=True) def test_get_bind_by_name(auth, mock_get_app_by_name): auth.return_value = responses.auth_response with requests_mock.mock() as m: m.get( "https://example.com/v2/apps/" + responses.app["metadata"]["guid"] + "/service_bindings", status_code=200, json=responses.binds, complete_qs=True) get_bind_by_name("my-bind", config.config, secrets.secrets, app_name="my-app", org_name="my-org") mock_get_app_by_name.assert_has_calls([call("my-app", config.config, secrets.secrets, org_name="my-org", space_name=None)]) @patch('chaoscf.api.get_app_by_name', autospec=True, side_effect=FailedActivity("error")) @patch('chaoscf.api.auth', autospec=True) def test_get_bind_by_name_when_app_not_found(auth, mock_get_app_by_name): auth.return_value = responses.auth_response with requests_mock.mock() as m: m.get( "https://example.com/v2/apps/" + responses.app["metadata"]["guid"] + "/service_bindings", status_code=200, json=responses.binds, complete_qs=True) with pytest.raises(FailedActivity): get_bind_by_name("my-bind", config.config, secrets.secrets, app_name="my-app", org_name="my-org") mock_get_app_by_name.assert_has_calls([call("my-app", config.config, secrets.secrets, org_name="my-org", space_name=None)])
def stress_cpu(filter: str = None,
               duration: int = 120,
               timeout: int = 60,
               configuration: Configuration = None,
               secrets: Secrets = None):
    """
    Stress CPU up to 100% at random machines.

    Parameters
    ----------
    filter : str, optional
        Filter the virtual machines. If the filter is omitted all machines
        in the subscription will be selected as potential chaos candidates.
    duration : int, optional
        Duration of the stress test (in seconds) that generates high CPU
        usage. Defaults to 120 seconds.
    timeout : int
        Additional wait time (in seconds) for stress operation to be
        completed. Getting and sending data from/to Azure may take some
        time so it's not recommended to set this value to less than 30s.
        Defaults to 60 seconds.

    Examples
    --------
    Some calling examples. Deep dive into the filter syntax:
    https://docs.microsoft.com/en-us/azure/kusto/query/

    >>> stress_cpu("where resourceGroup=='rg'", configuration=c, secrets=s)
    Stress all machines from the group 'rg'

    >>> stress_cpu("where resourceGroup=='rg' and name='name'",
    ...            configuration=c, secrets=s)
    Stress the machine from the group 'rg' having the name 'name'

    >>> stress_cpu("where resourceGroup=='rg' | sample 2",
    ...            configuration=c, secrets=s)
    Stress two machines at random from the group 'rg'
    """
    logger.debug("Start stress_cpu: configuration='{}', filter='{}'".format(
        configuration, filter))

    machines = __fetch_machines(filter, configuration, secrets)
    client = __compute_mgmt_client(secrets, configuration)

    # Map each supported OS type to its Azure run-command id and the stress
    # script shipped alongside this module.
    run_commands = {
        OS_WINDOWS: ('RunPowerShellScript', "cpu_stress_test.ps1"),
        OS_LINUX: ('RunShellScript', "cpu_stress_test.sh"),
    }

    for machine in machines:
        vm_name = machine['name']
        resource_group = machine['resourceGroup']
        os_type = __get_os_type(machine)

        if os_type not in run_commands:
            raise FailedActivity("Cannot run CPU stress test on OS: %s"
                                 % os_type)
        command_id, script_name = run_commands[os_type]

        script_path = os.path.join(
            os.path.dirname(__file__), "scripts", script_name)
        with open(script_path) as script_file:
            script_content = script_file.read()

        parameters = {
            'command_id': command_id,
            'script': [script_content],
            'parameters': [{
                'name': "duration",
                'value': duration
            }]
        }

        logger.debug("Stressing CPU of machine: {}".format(vm_name))
        poller = client.virtual_machines.run_command(
            resource_group, vm_name, parameters)

        # Block until the script finished or the grace period elapsed.
        result = poller.result(duration + timeout)
        if not result:
            raise FailedActivity(
                "stress_cpu operation did not finish on time. "
                "You may consider increasing timeout setting.")
        logger.debug(result.value[0].message)  # stdout/stderr
def execute_activity(activity: Activity, configuration: Configuration, secrets: Secrets, dry: bool = False) -> Run: """ Low-level wrapper around the actual activity provider call to collect some meta data (like duration, start/end time, exceptions...) during the run. """ ref = activity.get("ref") if ref: activity = lookup_activity(ref) if not activity: raise FailedActivity( "could not find referenced activity '{r}'".format(r=ref)) pauses = activity.get("pauses", {}) pause_before = pauses.get("before") if pause_before: logger.info( "Pausing before next activity for {d}s...".format(d=pause_before)) time.sleep(pause_before) if activity.get("background"): logger.info("{t}: {n} [in background]".format( t=activity["type"].title(), n=activity.get("name"))) else: logger.info("{t}: {n}".format(t=activity["type"].title(), n=activity.get("name"))) start = datetime.utcnow() run = {"activity": activity.copy(), "output": None} result = None try: # only run the activity itself when not in dry-mode if not dry: result = run_activity(activity, configuration, secrets) run["output"] = result run["status"] = "succeeded" if result is not None: logger.debug(" => succeeded with '{r}'".format(r=result)) else: logger.debug(" => succeeded without any result value") except FailedActivity as x: error_msg = str(x) run["status"] = "failed" run["output"] = result run["exception"] = traceback.format_exception(type(x), x, None) logger.error(" => failed: {x}".format(x=error_msg)) finally: # capture the end time before we pause end = datetime.utcnow() run["start"] = start.isoformat() run["end"] = end.isoformat() run["duration"] = (end - start).total_seconds() pause_after = pauses.get("after") if pause_after: logger.info( "Pausing after activity for {d}s...".format(d=pause_after)) time.sleep(pause_after) return run