def _terminate_now(signum: int, frame: FrameType = None) -> None:
    """
    Handle the SIGTERM signal by interrupting the running experiment.

    :param signum: number of the signal delivered by the OS
    :param frame: execution frame at the time of delivery (unused)
    :raises InterruptExecution: always, when the signal is SIGTERM
    """
    if signum != signal.SIGTERM:
        return
    logger.warning("Caught SIGTERM signal, interrupting experiment now")
    raise InterruptExecution("SIGTERM signal received")
def get_or_raise(value: str = "AZURE_PUBLIC_CLOUD") -> azure_cloud.Cloud:
    """
    Return the `azure_cloud.Cloud` object matching ``value``.

    Falls back to the public cloud when ``value`` is empty.

    :param value: name of the Azure cloud, e.g. "AZURE_CHINA_CLOUD";
        leading/trailing whitespace and case are ignored
    :raises InterruptExecution: when the name is not a known Azure cloud
    """
    if not value:
        # `logger.warn` is a deprecated alias; use `logger.warning`
        logger.warning("Azure cloud not provided. Using"
                       " AZURE_PUBLIC_CLOUD as default")
        return azure_cloud.AZURE_PUBLIC_CLOUD

    cloud = value.strip().upper()
    if cloud == AZURE_PUBLIC_CLOUD:
        result = azure_cloud.AZURE_PUBLIC_CLOUD
    elif cloud == AZURE_CHINA_CLOUD:
        result = azure_cloud.AZURE_CHINA_CLOUD
    elif cloud == AZURE_US_GOV_CLOUD:
        result = azure_cloud.AZURE_US_GOV_CLOUD
    elif cloud == AZURE_GERMAN_CLOUD:
        result = azure_cloud.AZURE_GERMAN_CLOUD
    else:
        msg = "Invalid Azure cloud '{}'. Please " \
              "provide a proper cloud value".format(cloud)
        logger.info(msg)
        raise InterruptExecution(msg)
    return result
def run(resource_group: str, compute: dict, parameters: dict,
        client: ComputeManagementClient):
    """
    Run a command against a VM or a VMSS instance and wait for completion.

    :param resource_group: resource group containing the compute
    :param compute: dict describing the target; must carry 'type' and
        either 'name' (VM) or 'scale_set' + 'instance_id' (VMSS instance)
    :param parameters: run-command parameters passed to the Azure API
    :param client: initialized compute management client
    :raises InterruptExecution: for an unknown resource type
    :raises FailedActivity: when the Azure call fails or returns no output
    """
    compute_type = compute.get('type').lower()
    try:
        if compute_type == RES_TYPE_VMSS_VM.lower():
            poller = client.virtual_machine_scale_set_vms.begin_run_command(
                resource_group, compute['scale_set'], compute['instance_id'],
                parameters)
        elif compute_type == RES_TYPE_VM.lower():
            poller = client.virtual_machines.begin_run_command(
                resource_group, compute['name'], parameters)
        else:
            msg = "Running a command for the unknown resource type '{}'".format(
                compute.get('type'))
            raise InterruptExecution(msg)
    except HttpResponseError as e:
        # chain the HTTP error so the original traceback is not lost
        raise FailedActivity(e.message) from e

    result = poller.result()  # Blocking till executed
    if result and result.value:
        logger.debug(result.value[0].message)  # stdout/stderr
    else:
        raise FailedActivity(
            "Operation did not finish properly."
            " You may consider to increase the timeout in the experiment configuration."
        )
def is_allowed_to_continue(session: requests.Session,
                           extensions: List[Extension]) -> None:
    """
    Query the runtime policy and raise when the execution must stop.

    Returns silently (annotation fixed from the incorrect `NoReturn` to
    `None`) when the experiment/execution cannot be identified, when the
    safeguards endpoint errors, or when all policies allow the run.

    :raises InterruptExecution: when at least one safeguard disallows
        the execution from continuing
    """
    experiment_id = get_experiment_id(extensions)
    if not experiment_id:
        return

    execution_id = get_execution_id(extensions)
    if not execution_id:
        return

    safeguards_url = urls.safeguard(urls.execution(
        urls.experiment(session.base_url, experiment_id=experiment_id),
        execution_id=execution_id))
    r = session.get(safeguards_url)
    if r.status_code > 399:
        # best effort: an unreachable policy service never blocks the run
        return

    state = r.json()
    if state.get("allowed", True) is False:
        safeguards = "\n".join([p["name"] for p in state.get("policies")])
        with state_lock:
            # keep a copy so later mutations of the response don't leak in
            safeguards_state[execution_id] = deepcopy(state.get("policies"))
        raise InterruptExecution(
            "The following safe guards disallow this execution from "
            "continuing:\n{}".format(safeguards)
        )
def run(resource_group: str, compute: dict, timeout: int, parameters: dict,
        secrets, configuration):
    """
    Execute a run-command against the given compute (VM or VMSS instance)
    and wait up to ``timeout`` seconds for it to complete.

    :raises InterruptExecution: for an unknown resource type
    :raises FailedActivity: when the operation yields no result value
    """
    client = init_compute_management_client(secrets, configuration)
    compute_type = compute.get('type').lower()

    if compute_type == RES_TYPE_VM.lower():
        poller = client.virtual_machines.run_command(
            resource_group, compute['name'], parameters)
    elif compute_type == RES_TYPE_VMSS_VM.lower():
        poller = client.virtual_machine_scale_set_vms.run_command(
            resource_group, compute['scale_set'], compute['instance_id'],
            parameters)
    else:
        msg = "Trying to run a command for the unknown resource type '{}'" \
            .format(compute.get('type'))
        raise InterruptExecution(msg)

    # blocks until the command finished or the timeout elapsed
    result = poller.result(timeout)
    if not (result and result.value):
        raise FailedActivity("Operation did not finish properly."
                             " You may consider increasing timeout setting.")
    logger.debug(result.value[0].message)  # stdout/stderr
def before_activity_control(context: Activity, **kwargs):
    """
    Ask the operator to confirm execution of the upcoming activity.

    :raises InterruptExecution: when the operator declines to continue
    """
    activity_name = context.get("name")
    logger.info("About to execute activity: " + activity_name)
    if not click.confirm('Do you want to continue?'):
        raise InterruptExecution("Experiment manually interrupted")
    logger.info("Continuing: " + activity_name)
def __authentication_type(secrets: dict) -> str:
    """
    Derive the authentication flavor from the provided secrets.

    :return: SERVICE_PRINCIPAL when a non-empty client secret is set,
        AAD_TOKEN when a non-empty access token is set
    :raises InterruptExecution: when neither credential is present
    """
    # .get() returns None for a missing key, matching the original
    # "key present and truthy" check
    if secrets.get('client_secret'):
        return SERVICE_PRINCIPAL
    if secrets.get('access_token'):
        return AAD_TOKEN
    raise InterruptExecution("Authentication to Azure requires a"
                             " client secret or an access token")
def send_experiment_event(event: str, context: dict, state: dict,
                          settings: Settings):
    """
    Publish an experiment event to the Proofdock cloud.

    Any failure is logged and converted into an `InterruptExecution`.

    :param event: event name, e.g. 'before-experiment'
    :param context: experiment context payload
    :param state: current experiment state payload
    :param settings: chaostoolkit settings used to open the session
    :raises InterruptExecution: when the event could not be published
    """
    try:
        with client_session(verify_tls=False, settings=settings) as session:
            publish_event(event, context, state, settings, session)
    except Exception as ex:
        logger.error(
            "Could not update experiment state in the Proofdock "
            "cloud. %s", str(ex))
        logger.debug(ex)
        # chain the underlying failure so the traceback is not lost
        raise InterruptExecution() from ex
def prepare(machine: dict, script: str):
    """
    Resolve the Azure run-command id and platform-specific script file
    name for the machine's operating system.

    :return: tuple of (command id, script file name)
    :raises InterruptExecution: when the script is unsupported on Windows
    """
    if __get_os_type(machine) == OS_LINUX:
        return 'RunShellScript', "{}.sh".format(script)

    if script in UNSUPPORTED_WINDOWS_SCRIPTS:
        raise InterruptExecution(
            "'{}' is not supported for os '{}'".format(script, OS_WINDOWS))
    return 'RunPowerShellScript', "{}.ps1".format(script)
def __get_credentials(creds: dict) -> ServicePrincipalCredentials:
    """
    Build the Azure credentials object from the given secrets mapping.

    Uses a service principal when a client secret is present, otherwise
    falls back to an AAD access token.

    NOTE(review): bracket access is kept deliberately — a missing key
    raises KeyError, as in the original contract.

    :raises InterruptExecution: when neither credential is provided
    """
    client_secret = creds['azure_client_secret']
    if client_secret is not None:
        return ServicePrincipalCredentials(
            client_id=creds['azure_client_id'],
            secret=client_secret,
            tenant=creds['azure_tenant_id'],
            cloud_environment=__get_cloud_env_by_name(creds['azure_cloud']))

    if creds['access_token'] is not None:
        token = dict(accessToken=creds['access_token'])
        return AADTokenCredentials(token, creds['azure_client_id'])

    raise InterruptExecution("Authentication to Azure requires a"
                             " client secret or an access token")
def fetch_instances(vmss, instance_filter: str,
                    client: ComputeManagementClient) -> List[Dict[str, Any]]:
    """
    Fetch the instances of a scale set, filtered by a kusto-light query.

    :param vmss: scale set whose instances are listed
    :param instance_filter: filter query; defaults to "sample 1" when empty
    :param client: initialized compute management client
    :raises InterruptExecution: when the filter query cannot be parsed
    """
    if not instance_filter:
        instance_filter = "sample 1"

    try:
        instances = fetch_all_vmss_instances(vmss, client)
        result = kustolight.filter_resources(instances, instance_filter)
    except jmespath.exceptions.ParseError as x:
        # chain the parser error for easier debugging of bad queries
        raise InterruptExecution(
            "'{}' is an invalid query. Please have a look at the documentation."
            .format(instance_filter)) from x
    return result
def __create(secrets: Dict) -> AADMixin:
    """
    Create an authenticated Azure session object from the secrets.

    :param secrets: mapping carrying either a client secret or an
        access token
    :raises InterruptExecution: when authentication fails, or (from
        `__authentication_type`) when no credential is provided
    """
    _auth_type = __authentication_type(secrets)
    # __authentication_type raises for anything other than these two values,
    # so _authentication is always bound below
    if _auth_type == SERVICE_PRINCIPAL:
        _authentication = ServicePrincipalAuth()
    elif _auth_type == AAD_TOKEN:
        _authentication = TokenAuth()

    try:
        return _authentication.create(secrets)
    except AuthenticationError as e:
        msg = e.inner_exception.error_response.get('error_description')
        # chain the provider error so the original cause stays visible
        raise InterruptExecution(msg) from e
def prepare(compute: dict, script: str):
    """
    Resolve the run-command id and load the script content matching the
    compute's operating system.

    :return: tuple of (command id, script content)
    :raises InterruptExecution: when the script is unsupported on Windows
    """
    os_type = __get_os_type(compute)
    if os_type == OS_LINUX:
        command_id = 'RunShellScript'
        script_name = "{}.sh".format(script)
    else:
        if script in UNSUPPORTED_WINDOWS_SCRIPTS:
            raise InterruptExecution("'{}' is not supported for os '{}'"
                                     .format(script, OS_WINDOWS))
        command_id = 'RunPowerShellScript'
        script_name = "{}.ps1".format(script)

    # distinct names: keep the path string and the file handle apart
    script_path = os.path.join(
        os.path.dirname(__file__), "../scripts", script_name)
    with open(script_path) as script_file:
        script_content = script_file.read()
    return command_id, script_content
def __get_os_type(compute):
    """
    Return the lower-cased OS type of the given compute resource.

    Reads the OS from the VMSS-instance or VM payload shape depending on
    the resource type.

    :raises InterruptExecution: for an unknown resource type
    :raises FailedActivity: for an OS type other than Linux/Windows
    """
    compute_type = compute['type'].lower()
    if compute_type == RES_TYPE_VMSS_VM.lower():
        os_type = compute['storage_profile']['os_disk']['os_type']
    elif compute_type == RES_TYPE_VM.lower():
        os_type = compute['properties']['storageProfile']['osDisk']['osType']
    else:
        raise InterruptExecution(
            "Trying to run a command for the unknown resource type '{}'"
            .format(compute.get('type')))

    # error message deliberately shows the raw (non-lowered) value
    if os_type.lower() not in (OS_LINUX, OS_WINDOWS):
        raise FailedActivity("Unknown OS Type: %s" % os_type)

    return os_type.lower()
def fetch_resources(input_query: str, resource_type: str, secrets: Secrets,
                    configuration: Configuration):
    """
    Run a resource-graph query for the given resource type and return the
    matching resources as plain dicts.

    :param input_query: user-supplied filter query
    :param resource_type: Azure resource type to query
    :raises InterruptExecution: when the resource-graph call fails; the
        message aggregates the error code and all error details
    """
    # prepare query
    _query = __query_from(resource_type, input_query)
    _query_request = __query_request_from(_query, configuration)

    # prepare resource graph client
    try:
        client = init_resource_graph_client(secrets)
        resources = client.resources(_query_request)
    except HttpResponseError as e:
        msg = e.error.code
        if e.error.details:
            for d in e.error.details:
                msg += ": " + str(d)
        # chain the HTTP error so the original cause stays visible
        raise InterruptExecution(msg) from e

    # prepare results
    results = __to_dicts(resources.data)
    return results
def started(self, experiment: Experiment, journal: Journal) -> None:
    """
    Notify the ChaosIQ service the verification has now started.

    Provide it with the current journal and status.

    Side effects: records the start time on `self._start_time`, registers
    the execution remotely, and stores the resulting run id on
    `self.run_id` and in the experiment via `set_run_id`.
    """
    self._start_time = datetime.now()
    base_endpoint, verify_tls, orgs = get_call_context(self.settings)
    with client_session(base_endpoint, orgs, verify_tls,
                        self.settings) as session:
        # first register the execution itself with the ChaosIQ service
        r = initialize_execution(session, experiment, journal)
        if r.status_code not in [200, 201]:
            raise InterruptExecution(
                "It is possible you are trying to run a verification "
                "against a team that is not the active team of the `chaos` "  # noqa: E501
                "session. Please run `chaos team` to switch active team "
                "then try again. If the problem persists or the team is "
                "the correct one, please contact the ChaosIQ support.")
        payload = r.json()
        execution_id = payload["id"]
        # then create the verification run tied to that execution
        r = self._make_call("POST", self.verification_run_path, json={
            "journal": journal,
            "status": "started",
            "experiment_id": get_experiment_id(experiment),
            "execution_id": execution_id
        })
        error = self.get_error(r)
        # NOTE(review): `get_error(r)` is called before the `r is None`
        # check — presumably it tolerates None; confirm in its definition
        if error or (r is None):
            # best effort: a failed notification does not abort the run
            logger.error(
                "Failed to notify verification run was started: {}".format(
                    error))
            return
        payload = r.json()
        self.run_id = payload["id"]
        if self.run_id:
            logger.debug("Verification run '{}' started".format(self.run_id))
            set_run_id(self.run_id, experiment)
def fetch_resources(user_query: str, resource_type: str, secrets: Secrets,
                    configuration: Configuration):
    """
    Run a resource-graph query and return the matching resources.

    :param user_query: user-supplied filter query
    :param resource_type: Azure resource type to query
    :raises InterruptExecution: when the resource-graph call fails
    :raises FailedActivity: when the query matches no resources
    """
    # prepare query
    query_request = query.create_request(resource_type, user_query,
                                         configuration)

    # prepare resource graph client
    try:
        client = init_client(secrets)
        resources = client.resources(query_request)
    except HttpResponseError as e:
        # chain the HTTP error so the original cause stays visible
        raise InterruptExecution(e.message) from e

    # prepare results
    results = __to_dicts(resources.data)
    if not results:
        raise FailedActivity(
            "Could not find resources of type '{}' and filter '{}'".format(
                resource_type, user_query))
    return results
def before_experiment_control(context: Experiment,
                              configuration: Configuration = None,
                              secrets: Secrets = None,
                              settings: Settings = None, **kwargs):
    """
    before-control of the experiment's execution

    Called by the Chaos Toolkit before the experiment's begin but after the
    configuration and secrets have been loaded.

    Registers a new run in the Proofdock cloud and then emits the
    'before-experiment' event. Does nothing when upload is disabled.

    :raises InterruptExecution: when the run could not be created
    """
    if no_upload(settings):
        return

    try:
        logger.info('Creating experiment run in Proofdock...')
        with client_session(verify_tls=False, settings=settings) as session:
            execution = push_execution(settings, session)
            execution_ctx = {
                'id': execution.get('id'),
                'creation_time': execution.get('creation_time')
            }
            add_to_run_context(settings, 'execution', execution_ctx)
            logger.info("New experiment run with id: '{}' created.".format(
                execution.get('id')))
    except Exception as ex:
        logger.error('Could not create experiment run in Proofdock cloud. %s',
                     str(ex))
        logger.debug(ex)
        # chain the underlying failure so the traceback is not lost
        raise InterruptExecution() from ex

    send_experiment_event(event='before-experiment', context=context,
                          state=None, settings=settings)
def prepare(compute: dict, script_id: str):
    """Prepare the script

    :param compute: The instance to be attacked.
    :param script_id: The script's filename without the filename ending.
                      Is named after the activity name.
    :return: A tuple of the Command Id and the script content
    """
    if __get_os_type(compute) == OS_LINUX:
        command_id = 'RunShellScript'
        script_name = "{}.sh".format(script_id)
    else:
        if script_id in UNSUPPORTED_WINDOWS_SCRIPTS:
            raise InterruptExecution(
                "'{}' is not supported for os '{}'".format(
                    script_id, OS_WINDOWS))
        command_id = 'RunPowerShellScript'
        script_name = "{}.ps1".format(script_id)

    # keep the path string and the open file handle under distinct names
    script_path = os.path.join(os.path.dirname(__file__), "../scripts",
                               script_name)
    with open(script_path) as handle:
        return command_id, handle.read()
def auth(secrets: Dict) -> ClientSecretCredential:
    """
    Create Azure authentication client from a provided secrets.

    Service principle and token based auth types are supported. Token
    based auth do not currently support refresh token functionality.

    Type of authentication client is determined based on passed secrets.
    For example, secrets that contains a `client_id`, `client_secret` and
    `tenant_id` will create ServicePrincipalAuth client

    ```python
    {
        "client_id": "AZURE_CLIENT_ID",
        "client_secret": "AZURE_CLIENT_SECRET",
        "tenant_id": "AZURE_TENANT_ID"
    }
    ```

    If you are not working with Public Global Azure, e.g. China Cloud
    you can provide a `msrestazure.azure_cloud.Cloud` object under the
    `cloud` key. If omitted the Public Cloud is taken as default.

    Using this function goes as follows:

    ```python
    with auth(secrets) as cred:
        subscription_id = configuration.get("subscription_id")
        resource_client = ResourceManagementClient(cred, subscription_id)
        compute_client = ComputeManagementClient(cred, subscription_id)
    ```

    If you set a non-public cloud in the secrets, pass its resource
    manager endpoint as `base_url` when creating the management clients.

    :raises InterruptExecution: when the credential cannot be constructed
    """
    try:
        # NOTE(review): assumes secrets['cloud'] is always a populated
        # cloud object — a missing key would raise AttributeError here,
        # not ValueError. TODO confirm callers always set it.
        credential = ClientSecretCredential(
            tenant_id=secrets.get('tenant_id'),
            client_id=secrets.get('client_id'),
            client_secret=secrets.get('client_secret'),
            authority=urlparse(
                secrets.get('cloud').endpoints.active_directory).hostname)
    except ValueError as e:
        # chain the SDK validation error so the original cause is kept
        raise InterruptExecution(str(e)) from e
    yield credential
def before_loading_experiment_control(context: str):
    """Abort the run before the experiment is even loaded."""
    raise InterruptExecution("failed to load: {}".format(context))
def after_activity_control(**kwargs):
    """Control hook that unconditionally interrupts after any activity."""
    raise InterruptExecution()
def interrupt_me():
    """Unconditionally raise `InterruptExecution` (test/demo helper)."""
    raise InterruptExecution()
def before_activity_control(context: Activity, **kwargs):
    """Control hook that unconditionally interrupts before any activity."""
    raise InterruptExecution("let's blow this up")
def force_interrupting_experiment():
    """Unconditionally raise `InterruptExecution` to stop the experiment."""
    raise InterruptExecution()
def aws_client(resource_name: str, configuration: Configuration = None,
               secrets: Secrets = None):
    """
    Create a boto3 client for the given resource.

    You may pass the `aws_region` key in the `configuration` object to
    be explicit about which region you want to use.

    You may pass `aws_profile_name` value to the `configuration` object so
    that we load the appropriate profile to converse with the AWS services.
    In that case, make sure your local `~/aws/credentials` config is
    properly setup, as per
    https://boto3.readthedocs.io/en/latest/guide/configuration.html#aws-config-file

    Also, if you want to assume a role, you should setup that file as per
    https://boto3.readthedocs.io/en/latest/guide/configuration.html#assume-role-provider
    as we do not read those settings from the `secrets` object.

    :raises InterruptExecution: when no region could be determined from
        either the configuration or the environment
    """  # noqa: E501
    configuration = configuration or {}
    aws_profile_name = configuration.get("aws_profile_name")
    aws_assume_role_arn = configuration.get("aws_assume_role_arn")
    params = get_credentials(secrets)

    region = configuration.get("aws_region")
    if not region:
        logger.debug(
            "The configuration key `aws_region` is not set, looking in the "
            "environment instead for `AWS_REGION` or `AWS_DEFAULT_REGION`")
        region = os.getenv("AWS_REGION", os.getenv("AWS_DEFAULT_REGION"))
        if not region:
            raise InterruptExecution("AWS requires a region to be set!")

    # region is guaranteed truthy past this point, so the former
    # `if region:` re-checks were redundant and have been dropped
    logger.debug("Using AWS region '{}'".format(region))
    params["region_name"] = region

    if boto3.DEFAULT_SESSION is None:
        # we must create our own session so that we can populate the profile
        # name when it is provided. Only create the default session once.
        boto3.setup_default_session(profile_name=aws_profile_name, **params)

    if not aws_assume_role_arn:
        logger.debug(
            "Client will be using profile '{}' from boto3 session".format(
                aws_profile_name or "default"))
        return boto3.client(resource_name, **params)

    logger.debug(
        "Fetching credentials dynamically assuming role '{}'".format(
            aws_assume_role_arn))

    aws_assume_role_session_name = configuration.get(
        "aws_assume_role_session_name")
    if not aws_assume_role_session_name:
        aws_assume_role_session_name = "ChaosToolkit"
        logger.debug(
            "You are missing the `aws_assume_role_session_name` "
            "configuration key. A unique one was generated: '{}'".format(
                aws_assume_role_session_name))

    client = boto3.client('sts', **params)
    params = {
        "RoleArn": aws_assume_role_arn,
        "RoleSessionName": aws_assume_role_session_name
    }
    response = client.assume_role(**params)
    creds = response['Credentials']
    logger.debug(
        "Temporary credentials will expire on {}".format(
            creds["Expiration"].isoformat()))

    params = {
        "aws_access_key_id": creds['AccessKeyId'],
        "aws_secret_access_key": creds['SecretAccessKey'],
        "aws_session_token": creds['SessionToken'],
        "region_name": region
    }
    return boto3.client(resource_name, **params)
def before_loading_experiment_control(context: str):
    """Abort loading of the experiment with an explicit failure message."""
    raise InterruptExecution(f"failed to load: {context}")
def create(self, secrets: Secrets) -> AADMixin:
    """Abstract hook: subclasses must build the authenticated session."""
    raise InterruptExecution("Not implemented")
def handler(signum, frame):
    """Signal handler that interrupts the experiment when invoked."""
    raise InterruptExecution("boom")
def before_activity_control(context: Activity, target_activity_name: str,
                            **kwargs):
    """Interrupt the experiment right before the targeted activity runs."""
    if context.get("name") != target_activity_name:
        return
    raise InterruptExecution("let's blow this up")