def _count_background(activities) -> int:
    """Count the non-empty entries that carry a truthy `background` flag."""
    return sum(
        1 for activity in activities
        if activity and activity.get("background"))


# The return annotation is quoted so that it does not require any typing
# name that this module may not import.
def get_background_pools(
        experiment: Experiment
) -> "Tuple[Optional[ThreadPoolExecutor], Optional[ThreadPoolExecutor]]":
    """
    Create the pools used to run background activities.

    Two pools are considered: one for the activities of the `method` block
    and one for the activities of the `rollbacks` block. Each pool is as big
    as the number of activities flagged with `background` in its block.

    Returns a pair `(activity_pool, rollback_pool)`. A member of the pair is
    `None` when its block declares no background activity.
    """
    activity_pool = None
    activity_background_count = _count_background(
        experiment.get("method", []))
    if activity_background_count:
        logger.debug("{c} activities will be run in the background".format(
            c=activity_background_count))
        activity_pool = ThreadPoolExecutor(activity_background_count)

    rollback_pool = None
    rollback_background_count = _count_background(
        experiment.get("rollbacks", []))
    if rollback_background_count:
        logger.debug("{c} rollbacks will be run in the background".format(
            c=rollback_background_count))
        rollback_pool = ThreadPoolExecutor(rollback_background_count)

    return activity_pool, rollback_pool
def get_all_activities(experiment: Experiment) -> List[Activity]:
    """
    Collect every activity of the experiment into a new list.

    The list holds the steady-state hypothesis probes first, then the
    method activities and finally the rollbacks.
    """
    hypothesis = experiment.get("steady-state-hypothesis", {})
    collected = list(hypothesis.get("probes", []))
    collected += experiment.get("method", [])
    collected += experiment.get("rollbacks", [])
    return collected
def get_controls(experiment: Experiment) -> List[Control]:
    """
    Gather all the controls declared anywhere in the experiment.

    Controls are taken, in that order, from the experiment itself, from its
    steady-state hypothesis and from each activity returned by
    `get_all_activities`.
    """
    gathered = list(experiment.get("controls", []))
    hypothesis = experiment.get("steady-state-hypothesis", {})
    gathered += hypothesis.get("controls", [])
    for activity in get_all_activities(experiment):
        gathered += activity.get("controls", [])
    return gathered
def configure(self, experiment: Experiment, settings: Settings,
              experiment_vars: Dict[str, Any]) -> None:
    """
    Load the settings, configuration and secrets used by this instance.

    `experiment_vars`, when truthy, is unpacked into a pair of
    (configuration variables, secret variables); otherwise both are `None`.
    When `settings` is `None`, the already loaded settings are used.
    """
    if experiment_vars:
        config_vars, secret_vars = experiment_vars
    else:
        config_vars, secret_vars = None, None

    if settings is None:
        settings = get_loaded_settings()
    self.settings = settings

    declared_configuration = experiment.get("configuration", {})
    self.config = load_configuration(declared_configuration, config_vars)

    declared_secrets = experiment.get("secrets", {})
    self.secrets = load_secrets(declared_secrets, self.config, secret_vars)
def cache_activities(experiment: Experiment) -> None:
    """
    Cache the named activities into a map so we can quickly lookup ref.

    The activities of the `method` block and the probes of the steady-state
    hypothesis are stored in the module-level `_cache`, keyed by their
    name. Activities without a name are skipped. Nothing is returned.
    """
    logger.debug("Building activity cache...")

    lot = experiment.get("method", []) + \
        experiment.get("steady-state-hypothesis", {}).get("probes", [])

    for activity in lot:
        name = activity.get("name")
        if name:
            _cache[name] = activity

    logger.debug("Cached {d} activities".format(d=len(_cache)))
def run_rollbacks(experiment: Experiment, configuration: Configuration,
                  secrets: Secrets, pool: ThreadPoolExecutor,
                  dry: bool = False) -> Iterator[Run]:
    """
    Run all rollbacks declared in the experiment, in their declared order.

    Each rollback is executed and its result yielded before moving to the
    next one, unless the rollback is declared with the `background` flag:
    in that case it is submitted to `pool` and the returned future is
    yielded instead.
    """
    declared = experiment.get("rollbacks", [])
    if not declared:
        logger.info("No declared rollbacks, let's move on.")

    for rollback in declared:
        logger.info("Rollback: {t}".format(t=rollback.get("name")))

        if not rollback.get("background"):
            yield execute_activity(
                experiment, rollback, configuration=configuration,
                secrets=secrets, dry=dry)
            continue

        logger.debug("rollback activity will run in the background")
        yield pool.submit(
            execute_activity, experiment=experiment, activity=rollback,
            configuration=configuration, secrets=secrets, dry=dry)
def has_steady_state_hypothesis_with_probes(experiment: Experiment) -> bool:
    """
    Tell whether the experiment declares a steady-state hypothesis that
    contains at least one probe.
    """
    hypothesis = experiment.get("steady-state-hypothesis")
    if not hypothesis:
        return False

    probes = hypothesis.get("probes")
    return len(probes) > 0 if probes else False
def ensure_hypothesis_is_valid(experiment: Experiment):
    """
    Check the steady state hypothesis entry of the experiment, when there
    is one.

    Raises :exc:`InvalidExperiment` when the hypothesis has no title and
    :exc:`InvalidActivity` when one of its probes has no tolerance or a
    tolerance of an unsupported type. Each probe is also passed to
    `ensure_activity_is_valid`.
    """
    hypothesis = experiment.get("steady-state-hypothesis")
    if hypothesis is None:
        return

    if not hypothesis.get("title"):
        raise InvalidExperiment("hypothesis requires a title")

    accepted_tolerance_types = (bool, int, list, str, dict)
    for probe in hypothesis.get("probes") or []:
        ensure_activity_is_valid(probe)

        if "tolerance" not in probe:
            raise InvalidActivity(
                "hypothesis probe must have a tolerance entry")

        tolerance = probe["tolerance"]
        if not isinstance(tolerance, accepted_tolerance_types):
            raise InvalidActivity(
                "hypothesis probe tolerance must either be an integer, "
                "a string, a boolean or a pair of values for boundaries. "
                "It can also be a dictionary which is a probe activity "
                "definition that takes an argument called `value` with "
                "the value of the probe itself to be validated")

        # NOTE(review): this validates the probe a second time; it may have
        # been meant to validate a dictionary tolerance instead -- confirm.
        if isinstance(probe, dict):
            ensure_activity_is_valid(probe)
def run_activities(experiment: Experiment, configuration: Configuration,
                   secrets: Secrets, pool: ThreadPoolExecutor,
                   dry: bool = False) -> Iterator[Run]:
    """
    Internal generator that walks the `method` block and executes each of
    its activities.

    Yields the result of each run, or the :class:`concurrent.futures.Future`
    returned by `pool` when the activity is flagged to run in the
    `background`.
    """
    declared = experiment.get("method", [])
    if not declared:
        logger.info("No declared activities, let's move on.")

    for activity in declared:
        call_arguments = dict(
            experiment=experiment, activity=activity,
            configuration=configuration, secrets=secrets, dry=dry)

        if not activity.get("background"):
            yield execute_activity(**call_arguments)
            continue

        logger.debug("activity will run in the background")
        yield pool.submit(execute_activity, **call_arguments)
def get_all_activities_in_experiment(experiment: Experiment) -> List[Activity]:
    """
    Return every activity of the given experiment as a single new list:
    the steady-state hypothesis probes, then the method activities, then
    the rollbacks. Handy when all activities must be iterated over.
    """
    hypothesis = experiment.get("steady-state-hypothesis")
    probes = hypothesis.get("probes", []) if hypothesis else []
    return [
        *probes,
        *experiment.get("method", []),
        *experiment.get("rollbacks", []),
    ]
def run_steady_state_hypothesis(experiment: Experiment,
                                configuration: Configuration, secrets: Secrets,
                                dry: bool = False):
    """
    Run all probes in the hypothesis and fail the experiment as soon as any
    of the probe fails or is outside the tolerance zone.

    Returns `None` when the experiment declares no steady-state hypothesis.
    Otherwise returns a state dictionary with two entries: `probes`, the
    list of probe runs (each one flagged with `tolerance_met`), and
    `steady_state_met`, which is `False` as soon as one probe failed or was
    out of tolerance, and `True` when all probes passed. In `dry` mode the
    tolerance of the probes is not checked.
    """
    state = {
        "steady_state_met": None,
        "probes": []
    }
    hypo = experiment.get("steady-state-hypothesis")
    if not hypo:
        logger.info(
            "No steady state hypothesis defined. That's ok, just exploring.")
        return

    logger.info("Steady state hypothesis: {h}".format(h=hypo.get("title")))

    with controls(level="hypothesis", experiment=experiment, context=hypo,
                  configuration=configuration, secrets=secrets) as control:
        probes = hypo.get("probes", [])
        # the state is handed over before the probes run; it is the same
        # dictionary that gets filled in below
        control.with_state(state)

        for activity in probes:
            run = execute_activity(
                experiment=experiment, activity=activity,
                configuration=configuration, secrets=secrets, dry=dry)

            state["probes"].append(run)

            if run["status"] == "failed":
                run["tolerance_met"] = False
                state["steady_state_met"] = False
                # `Logger.warn` is a deprecated alias of `Logger.warning`
                logger.warning("Probe terminated unexpectedly, "
                               "so its tolerance could not be validated")
                return state

            run["tolerance_met"] = True

            if dry:
                # do not check for tolerance when dry mode is on
                continue

            tolerance = activity.get("tolerance")
            logger.debug("allowed tolerance is {t}".format(t=str(tolerance)))
            checked = within_tolerance(
                tolerance, run["output"], configuration=configuration,
                secrets=secrets)
            if not checked:
                run["tolerance_met"] = False
                state["steady_state_met"] = False
                return state

        state["steady_state_met"] = True
        logger.info("Steady state hypothesis is met!")

    return state
def set_extension_value(experiment: Experiment, key: str, value: Any):
    """
    Store `value` under `key` in the `chaosiq` extension of the experiment.

    The value is always stored as a string. The first extension named
    `chaosiq` is updated; when there is none (or no `extensions` block at
    all), a new `chaosiq` extension holding the value is appended.
    """
    extensions = experiment.setdefault("extensions", [])
    # stringify once so that both the update and the creation paths store
    # the same representation of the value
    stored = str(value)

    for extension in extensions:
        if extension.get("name") == "chaosiq":
            extension[key] = stored
            break
    else:
        extensions.append({"name": "chaosiq", key: stored})
def del_extension_value(experiment: Experiment, key: str, silent: bool = True):
    """
    Remove `key` from every extension named `chaosiq` in the experiment.

    A missing key is ignored when `silent` is set, otherwise a `KeyError`
    is raised. An empty `extensions` block is created on the experiment
    when it has none.
    """
    for extension in experiment.setdefault("extensions", []):
        if extension.get("name") != "chaosiq":
            continue

        if key in extension:
            del extension[key]
        elif not silent:
            raise KeyError(key)
def set_experiment_id(experiment_id: str, experiment: Experiment) -> None:
    """
    Record `experiment_id` in the `chaosiq` extension of the experiment.

    The first extension named `chaosiq` is updated in place. When there is
    none (or no `extensions` block at all), a new `chaosiq` extension
    carrying the identifier is appended. Extensions without a name are
    skipped.
    """
    extensions = experiment.setdefault("extensions", [])
    for extension in extensions:
        # `.get` so that an extension lacking a name does not raise
        if extension.get("name") == "chaosiq":
            extension["experiment_id"] = experiment_id
            break
    else:
        extensions.append({
            "name": "chaosiq",
            "experiment_id": experiment_id
        })
def validate_extensions(experiment: Experiment):
    """
    Check that every declared extension carries a non-blank name.

    Raises :exc:`InvalidExperiment` for the first extension whose name is
    missing, empty or made of whitespace only. Nothing is checked when the
    experiment declares no extensions.
    """
    for extension in experiment.get("extensions") or []:
        name = extension.get("name")
        if name and name.strip():
            continue
        raise InvalidExperiment("All extensions require a non-empty name")
def before_experiment_control(context: Experiment, **kwargs):
    """
    Start a tracing span, named after the experiment's title, when the
    experiment's execution begins.

    The span is stored on the tracer as `experiment_span`. It is tagged
    with the experiment's tags and contributions when declared, and any
    extra keyword arguments are logged on it.
    """
    tracer = local.tracer
    span = tracer.start_span(context.get("title"))
    tracer.experiment_span = span
    span.set_tag('type', 'experiment')

    tags = context.get("tags")
    if tags:
        span.set_tag('target', ', '.join(tags))

    contributions = context.get("contributions") or {}
    for contribution, value in contributions.items():
        span.set_tag(contribution, value)

    if kwargs:
        span.log_kv(kwargs)
def add_contribution_model(experiment: Experiment):
    """
    Expose the contribution of that experiment to the report.

    Contributions are declared by an extension: the first non-empty
    `contributions` entry found among the extensions is bubbled up to the
    experiment itself, under the `contributions` key, for rendering
    purpose. The experiment is left untouched when none is found.
    """
    declared = (
        extension.get("contributions")
        for extension in experiment.get("extensions", [])
    )
    first_found = next((entry for entry in declared if entry), None)
    if first_found:
        experiment["contributions"] = first_found
def initialize_run_journal(experiment: Experiment) -> Journal:
    """
    Build the initial journal of a run.

    The journal records the library version, the platform and node the run
    takes place on, a shallow copy of the experiment and the start time as
    a naive UTC ISO 8601 string. The status, steady states, runs and
    rollbacks entries start empty and `deviated` starts as `False`.
    """
    # local import: only the `datetime` class is known to be in scope here
    from datetime import timezone

    # `datetime.utcnow()` is deprecated; dropping the tzinfo afterwards
    # keeps the exact same naive ISO format as before
    started = datetime.now(timezone.utc).replace(tzinfo=None)

    return {
        "chaoslib-version": __version__,
        "platform": platform.platform(),
        "node": platform.node(),
        "experiment": experiment.copy(),
        "start": started.isoformat(),
        "status": None,
        "deviated": False,
        "steady_states": {"before": None, "after": None, "during": []},
        "run": [],
        "rollbacks": [],
    }
def get_context_controls(
    level: str,
    experiment: Experiment = None,  # noqa: C901
    context: Union[Activity, Experiment] = None,
) -> List[Control]:
    """
    Get the controls at the given level by merging those declared at the
    experiment level with the current's context and the global controls.

    Without an experiment, only the global controls are returned. When the
    experiment declares no top-level controls, the context's controls
    followed by the global controls are returned. When neither the context
    nor the globals provide any control, or when the level is `method` or
    `rollback`, deep copies of the top-level controls that are `automatic`
    (the default) are returned.

    If a control is declared at the current level, a top-level control of
    the same name does not override it. A context control carrying a `ref`
    pulls in a deep copy of the top-level control named by that `ref`.
    """
    glbl_controls = get_global_controls()
    if not experiment:
        return glbl_controls

    top_level_controls = experiment.get("controls", [])
    # shallow copy so that the context's own list is not extended below
    controls = copy(context.get("controls", []))
    controls.extend(glbl_controls)

    # do we even have something at the top level to be merged?
    if not top_level_controls:
        return controls

    if not controls:
        return [
            deepcopy(c) for c in top_level_controls
            if c.get("automatic", True)
        ]

    if level in ["method", "rollback"]:
        return [
            deepcopy(c) for c in top_level_controls
            if c.get("automatic", True)
        ]

    # NOTE(review): `controls` is appended to while being iterated, so the
    # controls added below are visited by this loop as well -- confirm this
    # is intended.
    for c in controls:
        if "ref" in c:
            # resolve the reference against the top-level controls
            for top_level_control in top_level_controls:
                if c["ref"] == top_level_control["name"]:
                    controls.append(deepcopy(top_level_control))
                    break
        else:
            for tc in top_level_controls:
                if c.get("name") == tc.get("name"):
                    # declared at this level already, keep the local one
                    break
            else:
                # NOTE(review): no top-level control shares this name; `tc`
                # is the last top-level control at this point, so only that
                # one is considered -- confirm.
                if tc.get("automatic", True):
                    controls.append(deepcopy(tc))

    return controls
def get_extension(experiment: Experiment, name: str) -> Optional[Extension]:
    """
    Look an extension up by its name.

    Returns the first extension of the experiment whose `name` matches, or
    `None` when the experiment defines no extensions or none of them
    carries that name.
    """
    candidates = experiment.get("extensions") or ()
    return next(
        (ext for ext in candidates if ext.get("name") == name), None)
def apply_python_control(  # noqa: C901
    level: str,
    control: Control,
    experiment: Experiment,
    context: Union[Activity, Experiment],
    state: Union[Journal, Run, List[Run]] = None,
    configuration: Configuration = None,
    secrets: Secrets = None,
    settings: Settings = None,
):
    """
    Apply a control by calling its function matching the given level.

    Nothing happens when the control provides no function for that level.
    The function receives the `context` plus a deep copy of the provider's
    arguments, after substitution when a configuration or secrets are
    given. The `secrets`, `configuration`, `state`, `experiment`,
    `extensions` and `settings` values are passed as well, but only for
    those the function declares in its signature.
    """
    func = load_func(control, _level_mapping.get(level))
    if not func:
        return

    arguments = deepcopy(control["provider"].get("arguments", {}))
    if configuration or secrets:
        arguments = substitute(arguments, configuration, secrets)

    # values are wrapped in callables so that each of them is only computed
    # when the function actually asks for it
    injectable = (
        ("secrets", lambda: secrets),
        ("configuration", lambda: configuration),
        ("state", lambda: state),
        ("experiment", lambda: experiment),
        ("extensions", lambda: experiment.get("extensions")),
        ("settings", lambda: settings),
    )
    accepted = inspect.signature(func).parameters
    for parameter, supplier in injectable:
        if parameter in accepted:
            arguments[parameter] = supplier()

    func(context=context, **arguments)
def get_context_controls(level: str, experiment: Experiment,
                         context: Union[Activity, Experiment]) \
        -> List[Control]:
    """
    Get the controls at the given level by merging those declared at the
    experiment level with the current's context.

    When the context declares no controls, or when the level is `method` or
    `rollback`, deep copies of the top-level controls that are `automatic`
    (the default) are returned.

    Otherwise a new list is returned, starting with the context's controls.
    If a control is declared at the current level, a top-level control of
    the same name does not override it. A context control carrying a `ref`
    pulls in a deep copy of the top-level control named by that `ref`.

    Neither the context's nor the experiment's control lists are modified.
    """
    top_level_controls = experiment.get("controls", [])
    declared = context.get("controls", [])

    if not declared:
        if not top_level_controls:
            return []
        return [
            deepcopy(c) for c in top_level_controls
            if c.get("automatic", True)
        ]

    if level in ["method", "rollback"]:
        return [
            deepcopy(c) for c in top_level_controls
            if c.get("automatic", True)
        ]

    # merge into a fresh list: appending to the context's own list would
    # alter the experiment and pile up duplicates on every call
    controls = list(declared)

    for c in declared:
        if "ref" in c:
            # resolve the reference against the top-level controls
            for top_level_control in top_level_controls:
                if c["ref"] == top_level_control["name"]:
                    controls.append(deepcopy(top_level_control))
                    break
        else:
            for tc in top_level_controls:
                if c.get("name") == tc.get("name"):
                    # declared at this level already, keep the local one
                    break
            else:
                # NOTE(review): no top-level control shares this name; `tc`
                # is the last top-level control at this point, so only that
                # one is considered -- behavior kept as is, confirm intent.
                if tc.get("automatic", True):
                    controls.append(deepcopy(tc))

    return controls
def ensure_hypothesis_is_valid(experiment: Experiment):
    """
    Check the steady state hypothesis entry of the experiment, when there
    is one.

    Raises :exc:`InvalidExperiment` when the hypothesis has no title and
    :exc:`InvalidActivity` when one of its probes has no tolerance. Each
    probe is passed to `ensure_activity_is_valid` and its tolerance to
    `ensure_hypothesis_tolerance_is_valid`.
    """
    hypothesis = experiment.get("steady-state-hypothesis")
    if hypothesis is None:
        return

    if not hypothesis.get("title"):
        raise InvalidExperiment("hypothesis requires a title")

    for probe in hypothesis.get("probes") or []:
        ensure_activity_is_valid(probe)

        if "tolerance" not in probe:
            raise InvalidActivity(
                "hypothesis probe must have a tolerance entry")

        ensure_hypothesis_tolerance_is_valid(probe["tolerance"])
def run_steady_state_hypothesis(experiment: Experiment,
                                configuration: Configuration, secrets: Secrets,
                                dry: bool = False):
    """
    Run the probes of the hypothesis one after the other and stop as soon
    as one of them is outside its tolerance zone.

    Returns `None` when the experiment declares no steady-state hypothesis.
    Otherwise returns a state dictionary holding the probe runs under
    `probes` (each flagged with `tolerance_met`) and the overall verdict
    under `steady_state_met`. In `dry` mode tolerances are not checked.
    """
    hypothesis = experiment.get("steady-state-hypothesis")
    if not hypothesis:
        logger.info(
            "No steady state hypothesis defined. That's ok, just exploring.")
        return

    logger.info(
        "Steady state hypothesis: {h}".format(h=hypothesis.get("title")))

    outcome = {
        "steady_state_met": None,
        "probes": []
    }

    for probe in hypothesis.get("probes", []):
        run = execute_activity(
            probe, configuration=configuration, secrets=secrets, dry=dry)
        run["tolerance_met"] = True
        outcome["probes"].append(run)

        if dry:
            # tolerances are not verified in dry mode
            continue

        tolerance = probe.get("tolerance")
        logger.debug("allowed tolerance is {t}".format(t=str(tolerance)))
        if within_tolerance(tolerance, run["output"]):
            continue

        # first probe out of tolerance: stop here
        run["tolerance_met"] = False
        outcome["steady_state_met"] = False
        return outcome

    outcome["steady_state_met"] = True
    logger.info("Steady state hypothesis is met!")

    return outcome
def warn_about_deprecated_features(experiment: Experiment):
    """
    Warn about deprecated features.

    We do it globally so that we can warn only once about each feature and
    avoid repeating the same message over and over again.

    Two features are looked for: `process` providers whose arguments are
    given as a dictionary, and `vault` secrets declaring a `key` but no
    `path`. Each finding is reported at most once, both as a
    `DeprecationWarning` and as a warning log message.
    """
    providers = (
        activity.get("provider")
        for activity in get_all_activities_in_experiment(experiment)
    )
    uses_dict_arguments = any(
        isinstance(provider.get("arguments"), dict)
        for provider in providers
        if provider and provider.get("type") == "process"
    )
    if uses_dict_arguments:
        warnings.warn(DeprecatedDictArgsMessage, DeprecationWarning)
        logger.warning(DeprecatedDictArgsMessage)

    # vault now expects the path property
    # see https://github.com/chaostoolkit/chaostoolkit-lib/issues/77
    vault_secret_without_path = any(
        isinstance(value, dict) and value.get("type") == "vault"
        and "key" in value and "path" not in value
        for keys in experiment.get("secrets", {}).values()
        for value in keys.values()
    )
    if vault_secret_without_path:
        warnings.warn(DeprecatedVaultMissingPathMessage, DeprecationWarning)
        logger.warning(DeprecatedVaultMissingPathMessage)
def initialize_execution(session: Session, experiment: Experiment,
                         journal: Journal) -> Optional[Response]:
    """
    Build the payload of a new execution and post it to the remote service.

    Returns `None` when the experiment carries no experiment identifier or
    when the request could not be made. Otherwise the response is returned,
    whatever its status. On a 200 or 201 response, the identifier found in
    the response payload is recorded on the experiment as its execution id.
    """
    experiment_id = get_experiment_id(experiment.get('extensions'))
    if not experiment_id:
        logger.info("Missing experiment identifier")
        return

    journal["experiment"] = experiment
    journal["status"] = "running"

    experiment_url = urls.experiment(
        session.base_url, experiment_id=experiment_id)
    execution_url = urls.execution(experiment_url)

    try:
        # the listed extension values are left out of what gets sent over
        with remove_sensitive_extension_values(
                journal["experiment"], ["experiment_path"]):
            data = json.dumps(
                {"journal": journal}, ensure_ascii=False,
                default=json_encoder)
            r = session.post(
                execution_url, data=data,
                headers={"content-type": "application/json"})
    except Exception:
        logger.debug("Failed to create execution", exc_info=True)
        return

    if r.status_code in (200, 201):
        location = urls.clean(r.headers["Content-Location"])
        logger.info("Execution available at {}".format(location))
        payload = r.json()
        set_execution_id(payload["id"], experiment)
    else:
        content_type = r.headers.get("content-type", '')
        if 'application/json' in content_type:
            error = r.json()
        else:
            error = r.text
        logger.warning("Execution failed to be published: {}".format(error))

    return r
def ensure_verification_is_valid(experiment: Experiment):
    """
    Check that the experiment is a valid verification.

    On top of being a valid experiment, a verification must have an
    `extensions` block holding exactly one `chaosiq` extension, itself
    holding a `verification` block with an `id`, a
    `frequency-of-measurement` and a `duration-of-conditions` entry.
    Raises :exc:`InvalidVerification` on the first of these that is
    missing.
    """
    ensure_experiment_is_valid(experiment)

    extensions = experiment.get("extensions")
    if extensions is None:
        raise InvalidVerification(
            "a verification must have an extensions block")

    chaosiq_blocks = [
        extension for extension in extensions
        if extension.get("name", "") == "chaosiq"
    ]
    if len(chaosiq_blocks) != 1:
        raise InvalidVerification(
            "a verification must have a single chaosiq extension block")

    verification = chaosiq_blocks[0].get("verification")
    if verification is None:
        raise InvalidVerification(
            "a verification must have a verification block")

    # entries are checked in this order, the first missing one is reported
    required_entries = (
        ("id", "a verification must have an id"),
        ("frequency-of-measurement",
         "a verification must have a frequency-of-measurement block"),
        ("duration-of-conditions",
         "a verification must have a duration-of-conditions block"),
    )
    for entry, message in required_entries:
        if verification.get(entry) is None:
            raise InvalidVerification(message)

    logger.info("Verification looks valid")
# The return annotation is quoted so that it does not require any typing
# name that this module may not import.
def get_org_id(experiment: Experiment) -> "Optional[str]":
    """
    Return the `org_id` recorded in the `chaosiq` extension of the
    experiment.

    The first extension named `chaosiq` is used. `None` is returned when
    the experiment has no such extension or when that extension carries no
    `org_id`. Extensions without a name are skipped.
    """
    for extension in experiment.get("extensions", []):
        # `.get` so that an extension lacking a name does not raise
        if extension.get("name") == "chaosiq":
            return extension.get("org_id")
    return None
def apply_activities(experiment: Experiment, configuration: Configuration,
                     secrets: Secrets, pool: ThreadPoolExecutor,
                     journal: Journal, dry: bool = False) -> List[Run]:
    """
    Run the activities of the experiment's method, under the `method` level
    controls, and return the list of their runs.

    Activities are consumed from `run_activities` until they are exhausted
    or until the journal's status becomes `aborted`, `failed` or
    `interrupted`. Background activities are then collected from their
    futures so that the resulting list, also stored as `journal["run"]`,
    holds plain run dictionaries.

    A `SystemExit` raised while running the activities is re-raised once
    the results have been collected.
    """
    with controls(level="method", experiment=experiment, context=experiment,
                  configuration=configuration, secrets=secrets) as control:
        result = []
        runs = []
        method = experiment.get("method", [])
        wait_for_background_activities = True

        try:
            for run in run_activities(experiment, configuration, secrets, pool, dry):
                runs.append(run)
                # stop scheduling activities once the run was flagged as over
                if journal["status"] in ["aborted", "failed", "interrupted"]:
                    break
        except SystemExit as x:
            # when we got a signal for an ungraceful exit, we can decide
            # not to wait for background activities. Their statuses will
            # remain failed.
            wait_for_background_activities = x.code != 30  # see exit.py
            raise
        finally:
            # runs on normal completion as well as when SystemExit goes
            # through; None means waiting on a future without any limit
            background_activity_timeout = None

            if wait_for_background_activities and pool:
                logger.debug("Waiting for background activities to complete")
                pool.shutdown(wait=True)
            elif pool:
                harshly_terminate_pending_background_activities(pool)
                logger.debug(
                    "Do not wait for the background activities to finish "
                    "as per signal")
                # only give each pending future a short time to answer
                background_activity_timeout = 0.2
                pool.shutdown(wait=False)

            for index, run in enumerate(runs):
                if not run:
                    continue

                if isinstance(run, dict):
                    # foreground activity, the run is already available
                    result.append(run)
                else:
                    try:
                        # background activities
                        result.append(
                            run.result(timeout=background_activity_timeout))
                    # NOTE(review): which `TimeoutError` this names depends
                    # on the file's imports -- confirm it is the one raised
                    # by the future.
                    except TimeoutError:
                        # we want an entry for the background activity in our
                        # results anyway, we won't have anything meaningful
                        # to say about it
                        # NOTE(review): assumes `runs` and `method` share the
                        # same indexes -- confirm.
                        result.append({
                            "activity": method[index],
                            "status": "failed",
                            "output": None,
                            "duration": None,
                            "start": None,
                            "end": None,
                            "exception": None
                        })

            # now let's ensure the journal has all activities in their correct
            # order (background ones included)
            journal["run"] = result
            control.with_state(result)

    return result
def _run(
    self,
    strategy: Strategy,
    schedule: Schedule,  # noqa: C901
    experiment: Experiment,
    journal: Journal,
    configuration: Configuration,
    secrets: Secrets,
    settings: Settings,
    event_registry: EventHandlerRegistry) -> Journal:
    """
    Run the experiment from start to finish and return its journal.

    The sequence is: the steady-state hypothesis before the method, the
    method itself (with the hypothesis possibly running alongside it), the
    hypothesis after the method, and then the rollbacks. Which hypothesis
    phases run depends on `strategy`; none run when the experiment has no
    steady-state hypothesis with probes.

    An execution interruption, a Ctrl-C or an exit signal sets the
    journal's status to `interrupted`. Rollbacks are then still played,
    unless the exit signal carried the code 30.

    A journal is created when none is given. Controls are cleaned up and
    `event_registry.finish` is called in every case.
    """
    # the title goes through substitution with the configuration and secrets
    experiment["title"] = substitute(experiment["title"], configuration, secrets)
    logger.info("Running experiment: {t}".format(t=experiment["title"]))

    started_at = time.time()
    journal = journal or initialize_run_journal(experiment)
    event_registry.started(experiment, journal)

    control = Control()
    activity_pool, rollback_pool = get_background_pools(experiment)
    hypo_pool = get_hypothesis_pool()
    # set once the method is done, see below
    continous_hypo_event = threading.Event()

    dry = experiment.get("dry", False)
    if dry:
        logger.warning("Dry mode enabled")

    initialize_global_controls(experiment, configuration, secrets, settings)
    initialize_controls(experiment, configuration, secrets)

    logger.info("Steady-state strategy: {}".format(strategy.value))
    rollback_strategy = settings.get("runtime", {}).get("rollbacks", {}).get(
        "strategy", "default")
    logger.info("Rollbacks strategy: {}".format(rollback_strategy))

    exit_gracefully_with_rollbacks = True
    with_ssh = has_steady_state_hypothesis_with_probes(experiment)
    if not with_ssh:
        logger.info("No steady state hypothesis defined. That's ok, just "
                    "exploring.")

    try:
        try:
            control.begin("experiment", experiment, experiment,
                          configuration, secrets)

            # any value but None lets the method run; it is only replaced
            # when the hypothesis runs before the method
            state = object()
            if with_ssh and should_run_before_method(strategy):
                state = run_gate_hypothesis(experiment, journal,
                                            configuration, secrets,
                                            event_registry, dry)

            if state is not None:
                if with_ssh and should_run_during_method(strategy):
                    run_hypothesis_during_method(hypo_pool,
                                                 continous_hypo_event,
                                                 strategy, schedule,
                                                 experiment, journal,
                                                 configuration, secrets,
                                                 event_registry, dry)

                state = run_method(strategy, activity_pool, experiment,
                                   journal, configuration, secrets,
                                   event_registry, dry)

                # the method is over: flag it for whatever waits on the event
                continous_hypo_event.set()
                if journal["status"] not in ["interrupted", "aborted"]:
                    if with_ssh and (state is not None) and \
                            should_run_after_method(strategy):
                        run_deviation_validation_hypothesis(
                            experiment, journal, configuration, secrets,
                            event_registry, dry)
        except InterruptExecution as i:
            journal["status"] = "interrupted"
            logger.fatal(str(i))
            event_registry.interrupted(experiment, journal)
        except KeyboardInterrupt:
            journal["status"] = "interrupted"
            logger.warning("Received a termination signal (Ctrl-C)...")
            event_registry.signal_exit()
        except SystemExit as x:
            journal["status"] = "interrupted"
            logger.warning("Received the exit signal: {}".format(x.code))

            # the exit code 30 asks for rollbacks to be skipped
            exit_gracefully_with_rollbacks = x.code != 30
            if not exit_gracefully_with_rollbacks:
                logger.warning("Ignoring rollbacks as per signal")
            event_registry.signal_exit()
        finally:
            hypo_pool.shutdown(wait=True)

        # just in case a signal overrode everything else to tell us not to
        # play them anyway (see the exit.py module)
        if exit_gracefully_with_rollbacks:
            run_rollback(rollback_strategy, rollback_pool, experiment,
                         journal, configuration, secrets, event_registry,
                         dry)

        journal["end"] = datetime.utcnow().isoformat()
        journal["duration"] = time.time() - started_at

        # the spec only allows these statuses, so if it's anything else
        # we override to "completed"
        if journal["status"] not in ("completed", "failed", "aborted",
                                     "interrupted"):
            journal["status"] = "completed"

        has_deviated = journal["deviated"]
        # a deviation is reported in the log in place of the status, the
        # journal's own status is left as it is
        status = "deviated" if has_deviated else journal["status"]
        logger.info("Experiment ended with status: {s}".format(s=status))
        if has_deviated:
            logger.info(
                "The steady-state has deviated, a weakness may have been "
                "discovered")

        control.with_state(journal)
        try:
            control.end("experiment", experiment, experiment,
                        configuration, secrets)
        except ChaosException:
            logger.debug("Failed to close controls", exc_info=True)
    finally:
        # the nested finally makes sure the registry is told about the end
        # of the run even when cleaning up the controls fails
        try:
            cleanup_controls(experiment)
            cleanup_global_controls()
        finally:
            event_registry.finish(journal)

    return journal