Code Example #1
File: actions.py Project: pravarag/chaostoolkit-aws
def detach_random_instances(asg_names: List[str] = None,
                            tags: List[dict] = None,
                            instance_count: int = None,
                            instance_percent: int = None,
                            decrement_capacity: bool = False,
                            configuration: Configuration = None,
                            secrets: Secrets = None) -> AWSResponse:
    """
    Detaches one or more random instances from an autoscaling group

    Parameters:
        One of:
            asg_names: a list of one or more asg names
            tags: a list of key/value pairs to identify asg(s) by

        One of:
            instance_count: integer value of number of instances to detach
            instance_percent: 1-100, percent of instances to detach

        decrement_capacity: boolean value to determine if the desired capacity
        of the autoscaling group should be decreased

    `tags` are expected as a list of dictionary objects:
    [
        {'Key': 'TagKey1', 'Value': 'TagValue1'},
        {'Key': 'TagKey2', 'Value': 'TagValue2'},
        ...
    ]
    """
    validate_asgs(asg_names, tags)

    if not any([instance_count, instance_percent]):
        raise FailedActivity('You must specify either "instance_count" or '
                             '"instance_percent"')

    client = aws_client('autoscaling', configuration, secrets)

    if asg_names:
        asgs = get_asg_by_name(asg_names, client)
    else:
        asgs = get_asg_by_tags(tags, client)

    results = {}
    for a in asgs['AutoScalingGroups']:
        # Filter out all instances not currently 'InService'
        instances = [
            e['InstanceId'] for e in a['Instances']
            if e['LifecycleState'] == 'InService'
        ]

        if instance_percent:
            instance_count = int(
                float(len(instances) * float(instance_percent)) / 100)

        if instance_count > len(instances):
            raise FailedActivity('You are attempting to detach more instances '
                                 'than exist on asg %s' %
                                 (a['AutoScalingGroupName']))

        instances = random.sample(instances, instance_count)

        response = client.detach_instances(
            AutoScalingGroupName=a['AutoScalingGroupName'],
            InstanceIds=sorted(instances),
            ShouldDecrementDesiredCapacity=decrement_capacity)
        results.setdefault('Activities', []).extend(response['Activities'])
    return results
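As a hedged usage sketch of the action above, the call below detaches half of the in-service instances of the ASGs carrying a given tag; the region, tag values and percentage are illustrative assumptions, and the function itself is assumed to be in scope.

```python
# Hypothetical call of detach_random_instances(); region and tag values are
# placeholders, not taken from the listing above.
configuration = {"aws_region": "us-east-1"}  # example region

response = detach_random_instances(
    tags=[{"Key": "Environment", "Value": "staging"}],  # select ASGs by tag
    instance_percent=50,       # detach half of the 'InService' instances
    decrement_capacity=True,   # also lower the group's desired capacity
    configuration=configuration,
)
print(response.get("Activities", []))
```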
Code Example #2
def chaosansible_run(
    host_list: list = ("localhost",),  # a 1-tuple, not a bare string
    configuration: Configuration = None,
    facts: bool = False,
    become: bool = False,
    run_once: bool = False,
    ansible: dict = {},
    num_target: str = "all",
    secrets: Secrets = None,
):

    """
    Run a task through ansible and optionally gather facts from the hosts
    """

    # Check for correct inputs
    if ansible:
        if ansible.get("module") is None:
            raise InvalidActivity("No ansible module defined")

        if ansible.get("args") is None:
            raise InvalidActivity("No ansible module args defined")

    configuration = configuration or {}

    # Ansible configuration elements
    module_path = configuration.get("ansible_module_path")
    become_user = configuration.get("ansible_become_user")
    ssh_key_path = configuration.get("ansible_ssh_private_key")
    ansible_user = configuration.get("ansible_user")
    become_ask_pass = configuration.get("become_ask_pass")
    ssh_extra_args = configuration.get("ansible_ssh_extra_args")

    context.CLIARGS = ImmutableDict(
        connection="smart",
        verbosity=0,
        module_path=module_path,
        forks=10,
        become=become,
        become_method="sudo",
        become_user=become_user,
        check=False,
        diff=False,
        private_key_file=ssh_key_path,
        remote_user=ansible_user,
        ssh_extra_args=ssh_extra_args,
    )

    # Update host_list according to the desired number of targets.
    # Generate a new host list because the original may be reused later.
    if num_target != "all":
        new_host_list = random_host(host_list, int(num_target))
    else:
        new_host_list = host_list[:]

    # Create an inventory
    sources = ",".join(new_host_list)
    if len(new_host_list) == 1:
        sources += ","

    loader = DataLoader()
    inventory = InventoryManager(loader=loader, sources=sources)

    # Instantiate callback for storing results
    results_callback = ResultsCollectorJSONCallback()

    variable_manager = VariableManager(loader=loader, inventory=inventory)
    if become_ask_pass:
        passwords = dict(become_pass=become_ask_pass)
    else:
        passwords = None

    # Ansible taskmanager
    tqm = TaskQueueManager(
        inventory=inventory,
        variable_manager=variable_manager,
        loader=loader,
        passwords=passwords,
        stdout_callback=results_callback,
        run_additional_callbacks=False,
    )

    # Ansible playbook
    play_source = dict(
        name="Ansible Play",
        hosts=new_host_list,
        gather_facts=facts,
        tasks=[
            dict(
                name="facts",
                action=dict(module="debug", args=dict(var="ansible_facts")),
            ),
        ],
    )

    # If no ansible task was provided we only gather facts; otherwise append it
    if ansible:
        module = ansible.get("module")
        args = ansible.get("args")
        play_source["tasks"].append(
            dict(
                name="task",
                run_once=run_once,
                action=dict(module=module, args=args),
                register="shell_out",
            )
        )

    # Create an ansible playbook
    play = Play().load(play_source,
                       variable_manager=variable_manager,
                       loader=loader)

    # Run it
    try:
        result = tqm.run(play)
    finally:
        tqm.cleanup()
        if loader:
            loader.cleanup_all_tmp_files()

    # Remove ansible tmpdir
    shutil.rmtree(C.DEFAULT_LOCAL_TMP, True)

    if len(results_callback.host_failed) > 0:
        print("Ansible error(s): ")
        for error in results_callback.host_failed:
            print(results_callback.host_failed[error].__dict__)

        raise FailedActivity("Failed to run ansible task")

    elif len(results_callback.host_unreachable) > 0:
        print("Unreachable host(s): ")
        for error in results_callback.host_unreachable:
            print(error)

        raise FailedActivity("At least one target is down")

    else:
        results = {}

        for host, result in results_callback.host_ok.items():
            results[host] = result

        return json.dumps(results)
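A rough sketch of driving the function above directly, assuming it is in scope; the host names, Ansible module and arguments are placeholders.

```python
# Hypothetical invocation of chaosansible_run(); hosts, module and arguments
# below are examples only.
result = chaosansible_run(
    host_list=["web-1.example.net", "web-2.example.net"],
    configuration={"ansible_user": "deploy"},  # key read by the function above
    facts=True,                                # also gather facts
    ansible={"module": "shell", "args": "uptime"},
    num_target="1",                            # run against one random host
)
print(result)  # JSON string keyed by host name
```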
Code Example #3
def network_latency(filter: str = None,
                    duration: int = 60,
                    delay: int = 200,
                    jitter: int = 50,
                    timeout: int = 60,
                    configuration: Configuration = None,
                    secrets: Secrets = None):
    """
    Increases the response time of the virtual machine.

    Parameters
    ----------
    filter : str, optional
        Filter the virtual machines. If the filter is omitted all machines in
        the subscription will be selected as potential chaos candidates.
    duration : int, optional
        How long the latency lasts. Defaults to 60 seconds.
    timeout : int
        Additional wait time (in seconds) for the latency operation to complete.
        Getting and sending data from/to Azure may take some time so it's not
        recommended to set this value to less than 30s. Defaults to 60 seconds.
    delay : int
        Added delay in ms. Defaults to 200.
    jitter : int
        Variance of the delay in ms. Defaults to 50.


    Examples
    --------
    Some calling examples. Deep dive into the filter syntax:
    https://docs.microsoft.com/en-us/azure/kusto/query/

    >>> network_latency("where resourceGroup=='rg'", configuration=c,
                    secrets=s)
    Increase the latency of all machines from the group 'rg'

    >>> network_latency("where resourceGroup=='rg' and name='name'",
                    configuration=c, secrets=s)
    Increase the latency of the machine from the group 'rg' having the name
    'name'

    >>> network_latency("where resourceGroup=='rg' | sample 2",
                    configuration=c, secrets=s)
    Increase the latency of two machines at random from the group 'rg'
    """

    logger.debug(
        "Start network_latency: configuration='{}', filter='{}'".format(
            configuration, filter))

    machines = __fetch_machines(filter, configuration, secrets)
    client = __compute_mgmt_client(secrets, configuration)

    for m in machines:
        name = m['name']
        group = m['resourceGroup']
        os_type = __get_os_type(m)
        if os_type == OS_LINUX:
            command_id = 'RunShellScript'
            script_name = "network_latency.sh"
        else:
            raise FailedActivity("Cannot run network latency test on OS: %s" %
                                 os_type)

        with open(
                os.path.join(os.path.dirname(__file__), "scripts",
                             script_name)) as file:
            script_content = file.read()

        logger.debug("Script content: {}".format(script_content))
        parameters = {
            'command_id': command_id,
            'script': [script_content],
            'parameters': [{
                'name': "duration",
                'value': duration
            }, {
                'name': "delay",
                'value': delay
            }, {
                'name': "jitter",
                'value': jitter
            }]
        }

        logger.debug("Increasing the latency of machine: {}".format(name))
        poller = client.virtual_machines.run_command(group, name, parameters)
        result = poller.result(duration + timeout)  # Blocking till executed
        logger.debug("Execution result: {}".format(poller))
        if result:
            logger.debug(result.value[0].message)  # stdout/stderr
        else:
            raise FailedActivity(
                "network_latency operation did not finish on time. "
                "You may consider increasing timeout setting.")
Code Example #4
                                       mocked_init_client.return_value)
    mocked_command_run.assert_called_with(scale_set['resourceGroup'],
                                          instance,
                                          parameters=ANY,
                                          client=mocked_client)


@patch('pdchaosazure.vmss.actions.fetch_vmss', autospec=True)
@patch('pdchaosazure.vmss.actions.fetch_instances', autospec=True)
@patch('pdchaosazure.vmss.actions.client.init', autospec=True)
@patch.object(pdchaosazure.common.compute.command,
              'prepare_path',
              autospec=True)
@patch.object(pdchaosazure.common.compute.command,
              'run',
              side_effect=FailedActivity("Activity monkey has failed"))
def test_unhappily_fill_disk(mocked_command_run, mocked_command_prepare_path,
                             mocked_init_client, fetch_instances, fetch_vmss):
    # arrange mocks
    mocked_command_prepare_path.return_value = '/root/burn/hard'

    scale_set = vmss_provider.provide_scale_set()
    instance = vmss_provider.provide_instance()
    fetch_vmss.return_value = [scale_set]
    fetch_instances.return_value = [instance]

    configuration = config_provider.provide_default_config()
    secrets = secrets_provider.provide_secrets_via_service_principal()

    mocked_client = MockComputeManagementClient()
    mocked_init_client.return_value = mocked_client
Code Example #5
def drain_nodes(name: str = None,
                label_selector: str = None,
                delete_pods_with_local_storage: bool = False,
                timeout: int = 120,
                secrets: Secrets = None) -> bool:
    """
    Drain nodes matching the given label or name, so that no pods are scheduled
    on them any longer and running pods are evicted.

    It does a similar job to `kubectl drain --ignore-daemonsets` or
    `kubectl drain --delete-local-data --ignore-daemonsets` if
    `delete_pods_with_local_storage` is set to `True`. There is no
    equivalent to the `kubectl drain --force` flag.

    You probably want to call `uncordon` from your experiment's rollbacks.
    """
    # first let's make the node unschedulable
    cordon_node(name=name, label_selector=label_selector, secrets=secrets)

    api = create_k8s_api_client(secrets)

    v1 = client.CoreV1Api(api)
    if name:
        ret = v1.list_node(field_selector="metadata.name={}".format(name))

        logger.debug("Found {d} node named '{s}'".format(d=len(ret.items),
                                                         s=name))
    else:
        ret = v1.list_node(label_selector=label_selector)

        logger.debug("Found {d} node(s) labelled '{s}'".format(
            d=len(ret.items), s=label_selector))

    nodes = ret.items
    if not nodes:
        raise FailedActivity(
            "failed to find a node that matches selector {}".format(
                label_selector))

    for node in nodes:
        node_name = node.metadata.name
        ret = v1.list_pod_for_all_namespaces(
            include_uninitialized=True,
            field_selector="spec.nodeName={}".format(node_name))

        logger.debug("Found {d} pods on node '{n}'".format(d=len(ret.items),
                                                           n=node_name))

        if not ret.items:
            continue

        # following the drain command from kubectl as best as we can
        eviction_candidates = []
        for pod in ret.items:
            name = pod.metadata.name
            phase = pod.status.phase
            volumes = pod.spec.volumes
            annotations = pod.metadata.annotations

            # do not handle mirror pods
            if annotations and "kubernetes.io/config.mirror" in annotations:
                logger.debug("Not deleting mirror pod '{}' on "
                             "node '{}'".format(name, node_name))
                continue

            if any(filter(lambda v: v.empty_dir is not None, volumes)):
                logger.debug("Pod '{}' on node '{}' has a volume made "
                             "of a local storage".format(name, node_name))
                if not delete_pods_with_local_storage:
                    logger.debug("Not evicting a pod with local storage")
                    continue
                logger.debug("Deleting anyway due to flag")
                eviction_candidates.append(pod)
                continue

            if phase in ["Succeeded", "Failed"]:
                eviction_candidates.append(pod)
                continue

            for owner in pod.metadata.owner_references:
                if owner.controller and owner.kind != "DaemonSet":
                    eviction_candidates.append(pod)
                    break
                elif owner.kind == "DaemonSet":
                    logger.debug(
                        "Pod '{}' on node '{}' is owned by a DaemonSet. Will "
                        "not evict it".format(name, node_name))
                    break
            else:
                raise FailedActivity(
                    "Pod '{}' on node '{}' is unmanaged, cannot drain this "
                    "node. Delete it manually first?".format(name, node_name))

        if not eviction_candidates:
            logger.debug("No pods to evict. Let's return.")
            return True

        logger.debug("Found {} pods to evict".format(len(eviction_candidates)))
        for pod in eviction_candidates:
            eviction = client.V1beta1Eviction()

            eviction.metadata = client.V1ObjectMeta()
            eviction.metadata.name = pod.metadata.name
            eviction.metadata.namespace = pod.metadata.namespace

            eviction.delete_options = client.V1DeleteOptions()
            try:
                v1.create_namespaced_pod_eviction(pod.metadata.name,
                                                  pod.metadata.namespace,
                                                  body=eviction)
            except ApiException as x:
                raise FailedActivity("Failed to evict pod {}: {}".format(
                    pod.metadata.name, x.body))

        pods = eviction_candidates[:]
        started = time.time()
        while True:
            logger.debug("Waiting for {} pods to go".format(len(pods)))

            if time.time() - started > timeout:
                remaining_pods = "\n".join([p.metadata.name for p in pods])
                raise FailedActivity(
                    "Draining nodes did not completed within {}s. "
                    "Remaining pods are:\n{}".format(timeout, remaining_pods))

            pending_pods = pods[:]
            for pod in pods:
                try:
                    p = v1.read_namespaced_pod(pod.metadata.name,
                                               pod.metadata.namespace)
                    # rescheduled elsewhere?
                    if p.metadata.uid != pod.metadata.uid:
                        pending_pods.remove(pod)
                        continue
                    logger.debug("Pod '{}' still around in phase: {}".format(
                        p.metadata.name, p.status.phase))
                except ApiException as x:
                    if x.status == 404:
                        # gone...
                        pending_pods.remove(pod)
            pods = pending_pods[:]
            if not pods:
                logger.debug("Evicted all pods we could")
                break

            time.sleep(10)

        return True
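A short sketch of calling this action, with the uncordon step the docstring recommends left as a comment; the label selector, timeout and secret layout are placeholders, and the exact name of the uncordon rollback action is assumed.

```python
# Illustrative drain of all worker nodes; selector and secrets are placeholders.
secrets = {"KUBERNETES_CONTEXT": "my-cluster"}  # assumed secrets layout

drained = drain_nodes(label_selector="role=worker", timeout=300, secrets=secrets)

# In the experiment rollbacks, undo the cordon applied by drain_nodes, e.g.:
# uncordon_node(label_selector="role=worker", secrets=secrets)  # name assumed
```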
Code Example #6
def get_metric_data(
    namespace: str,
    metric_name: str,
    dimension_name: str = None,
    dimension_value: str = None,
    dimensions: List[Dict[str, str]] = None,
    statistic: str = None,
    duration: int = 300,
    period: int = 60,
    offset: int = 0,
    unit: str = None,
    configuration: Configuration = None,
    secrets: Secrets = None,
) -> float:
    """Gets metric data for a given metric in a given time period. This method
    allows for more data to be retrieved than get_metric_statistics

    :params
        namespace: The AWS metric namespace
        metric_name: The name of the metric to pull data for
        One of:
            dimension_name, dimension_value: Required to search for ONE dimension
            dimensions: Required to search for combinations of dimensions.
            Dimensions are expected as a list of dictionary objects:
            [{'Name': 'Dim1', 'Value': 'Val1'}, {'Name': 'Dim2', 'Value': 'Val2'}, ...]
        unit: The type of unit desired to be collected
        statistic: The type of data to return.
            One of: Average, Sum, Minimum, Maximum, SampleCount
        period: The window in which to pull datapoints for
        offset: The time (seconds) to offset the endtime (from now)
        duration: The time (seconds) to set the start time (from now)
    """
    start_time = datetime.utcnow() - timedelta(seconds=duration)
    end_time = datetime.utcnow() - timedelta(seconds=offset)

    if dimensions is None and dimension_name is None and dimension_value is None:
        raise FailedActivity("You must supply argument for dimensions")

    args = {
        "MetricDataQueries": [{
            "Id": "m1",
            "MetricStat": {
                "Metric": {
                    "Namespace": namespace,
                    "MetricName": metric_name,
                },
                "Period": period,
                "Stat": statistic,
            },
            "Label": metric_name,
        }],
        "StartTime":
        start_time,
        "EndTime":
        end_time,
    }

    if dimensions:
        args["MetricDataQueries"][0]["MetricStat"]["Metric"][
            "Dimensions"] = dimensions
    elif dimension_name and dimension_value:
        args["MetricDataQueries"][0]["MetricStat"]["Metric"]["Dimensions"] = [{
            "Name":
            dimension_name,
            "Value":
            dimension_value
        }]

    if unit:
        args["MetricDataQueries"][0]["MetricStat"]["Unit"] = unit

    client = aws_client("cloudwatch", configuration, secrets)
    response = client.get_metric_data(**args)["MetricDataResults"]

    results = {}
    for r in response:
        results.setdefault(r["Label"], []).extend(r["Values"])

    result = 0
    for k, v in results.items():
        if not v:
            continue

        if statistic == "Sum":
            result = sum(v)
        elif statistic == "Minimum":
            result = min(v)
        elif statistic == "Maximum":
            result = max(v)
        else:
            result = mean(v)

    return round(result, 2)
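An illustrative call of the probe above against a standard CloudWatch metric; the namespace, metric, dimension value and region are placeholders.

```python
# Average CPU of a hypothetical auto-scaling group over the last 10 minutes.
avg_cpu = get_metric_data(
    namespace="AWS/EC2",
    metric_name="CPUUtilization",
    dimension_name="AutoScalingGroupName",
    dimension_value="my-asg",                   # placeholder ASG name
    statistic="Average",
    duration=600,                               # look back 10 minutes
    period=60,                                  # one datapoint per minute
    configuration={"aws_region": "us-east-1"},
)
print(avg_cpu)  # float rounded to 2 decimals, e.g. 42.17
```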
Code Example #7
File: probes.py Project: xpdable/chaostoolkit-aws
def get_metric_statistics(namespace: str,
                          metric_name: str,
                          dimension_name: str,
                          dimension_value: str,
                          duration: int = 60,
                          offset: int = 0,
                          statistic: str = None,
                          extended_statistic: str = None,
                          unit: str = None,
                          configuration: Configuration = None,
                          secrets: Secrets = None):
    """
    Get the value of a statistical calculation for a given metric.

    The period for which the calculation will be performed is specified by a duration and
    an offset from the current time. Both are specified in seconds.

    Example: A duration of 60 seconds and an offset of 30 seconds will yield a
    statistical value based on the time interval between 30 and 90 seconds in the past.

    More information about the input parameters is available in the documentation:
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/cloudwatch.html#CloudWatch.Client.get_metric_statistics
    """  # noqa: E501
    client = aws_client("cloudwatch", configuration, secrets)

    if statistic is None and extended_statistic is None:
        raise FailedActivity(
            'You must supply argument for statistic or extended_statistic')

    end_time = datetime.utcnow() - timedelta(seconds=offset)
    start_time = end_time - timedelta(seconds=duration)
    request_kwargs = {
        'Namespace': namespace,
        'MetricName': metric_name,
        'Dimensions': [{
            'Name': dimension_name,
            'Value': dimension_value
        }],
        'StartTime': start_time,
        'EndTime': end_time,
        'Period': duration
    }

    if statistic is not None:
        request_kwargs['Statistics'] = [statistic]
    if extended_statistic is not None:
        request_kwargs['ExtendedStatistics'] = [extended_statistic]
    if unit is not None:
        request_kwargs['Unit'] = unit

    logger.debug('Request arguments: {}'.format(request_kwargs))
    response = client.get_metric_statistics(**request_kwargs)

    datapoints = response['Datapoints']
    if len(datapoints) == 0:
        raise FailedActivity(
            "No datapoints found for metric {}.{}.{}.{}".format(
                namespace, metric_name, dimension_name, dimension_value))

    datapoint = datapoints[0]
    logger.debug('Response: {}'.format(response))
    try:
        if statistic is not None:
            return datapoint[statistic]
        elif extended_statistic is not None:
            return datapoint['ExtendedStatistics'][extended_statistic]
    except Exception as x:
        raise FailedActivity("Unable to parse response '{}': '{}'".format(
            response, str(x)))
Code Example #8
def load_credentials(secrets: Secrets = None):
    """
    Load GCP credentials from the experiment secrets

    To authenticate, you need to create a service account manually and either
    pass the filename or the content of the file into the `secrets` object.

    So, in the experiment, use one of the following:

    ```json
    {
        "gcp": {
            "service_account_file": "/path/to/file.json"
        }
    }
    ```

    ```json
    {
        "gcp": {
            "service_account_info": {
                "type": "service_account",
                "project_id": "...",
                "private_key_id": "...",
                "private_key": "...",
                "client_email": "...",
                "client_id": "...",
                "auth_uri": "https://accounts.google.com/o/oauth2/auth",
                "token_uri": "https://accounts.google.com/o/oauth2/token",
                "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
                "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/...."
            }
        }
    }
    ```

    You would likely want to read the value from the environment or Vault if
    you use the second approach, and avoid storing sensitive data in the
    experiment itself.

    Make sure your service account has enough permissions for the activities
    you wish to conduct (though do not give it too wide permissions either).

    See: https://developers.google.com/api-client-library/python/auth/service-accounts
    Also: http://google-auth.readthedocs.io/en/latest/reference/google.oauth2.service_account.html
    """  # noqa: E501
    secrets = secrets or {}
    service_account_file = secrets.get("service_account_file")
    service_account_info = secrets.get("service_account_info")

    credentials = None
    if service_account_file:
        service_account_file = os.path.expanduser(service_account_file)
        if not os.path.exists(service_account_file):
            raise FailedActivity("GCP account settings not found at {}".format(
                service_account_file))

        logger.debug(
            "Using GCP credentials from file: {}".format(service_account_file))
        credentials = Credentials.from_service_account_file(
            service_account_file)
    elif service_account_info and isinstance(service_account_info, dict):
        logger.debug("Using GCP credentials embedded into secrets")
        credentials = Credentials.from_service_account_info(
            service_account_info)
    else:
        raise FailedActivity(
            "missing GCP credentials settings in secrets of this activity")

    if credentials is not None and credentials.expired:
        logger.debug("GCP credentials need to be refreshed as they expired")
        credentials.refresh(httplib2.Http())

    if not credentials:
        raise FailedActivity(
            "missing a service account to authenticate with the "
            "Google Cloud Platform")

    return credentials
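A brief sketch of calling the loader directly. The function reads the flat keys itself, so a direct call receives the inner mapping rather than the `gcp` scoped block shown in the docstring; the file path is a placeholder.

```python
# Hypothetical direct use; the service account path is a placeholder.
secrets = {"service_account_file": "~/keys/chaos-sa.json"}

credentials = load_credentials(secrets)
# a google.oauth2.service_account.Credentials instance
print(credentials.service_account_email)
```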
Code Example #9
def fill_disk(filter: str = None,
              duration: int = 120,
              timeout: int = 60,
              size: int = 1000,
              path: str = None,
              configuration: Configuration = None,
              secrets: Secrets = None):
    """
    Fill the disk with random data.

    Parameters
    ----------
    filter : str, optional
        Filter the virtual machines. If the filter is omitted all machines in
        the subscription will be selected as potential chaos candidates.
    duration : int, optional
        Lifetime of the file created. Defaults to 120 seconds.
    timeout : int
        Additional wait time (in seconds)
        for the filling operation to complete.
        Getting and sending data from/to Azure may take some time so it's not
        recommended to set this value to less than 30s. Defaults to 60 seconds.
    size : int
        Size of the file created on the disk. Defaults to 1GB.
    path : str, optional
        The absolute path to write the fill file into.
        Defaults: C:/burn for Windows clients, /root/burn for Linux clients.


    Examples
    --------
    Some calling examples. Deep dive into the filter syntax:
    https://docs.microsoft.com/en-us/azure/kusto/query/

    >>> fill_disk("where resourceGroup=='rg'", configuration=c, secrets=s)
    Fill all machines from the group 'rg'

    >>> fill_disk("where resourceGroup=='rg' and name='name'",
                    configuration=c, secrets=s)
    Fill the machine from the group 'rg' having the name 'name'

    >>> fill_disk("where resourceGroup=='rg' | sample 2",
                    configuration=c, secrets=s)
    Fill two machines at random from the group 'rg'
    """

    logger.debug(
        "Start fill_disk: configuration='{}', filter='{}'".format(
            configuration, filter))

    machines = __fetch_machines(filter, configuration, secrets)
    client = __compute_mgmt_client(secrets, configuration)

    for m in machines:
        name = m['name']
        group = m['resourceGroup']
        os_type = __get_os_type(m)
        if os_type == OS_WINDOWS:
            command_id = 'RunPowerShellScript'
            script_name = "fill_disk.ps1"
            fill_path = "C:/burn" if path is None else path
        elif os_type == OS_LINUX:
            command_id = 'RunShellScript'
            script_name = "fill_disk.sh"
            fill_path = "/root/burn" if path is None else path
        else:
            raise FailedActivity(
                "Cannot run disk filling test on OS: %s" % os_type)

        with open(os.path.join(os.path.dirname(__file__),
                               "../scripts", script_name)) as file:
            script_content = file.read()

        logger.debug("Script content: {}".format(script_content))
        parameters = {
            'command_id': command_id,
            'script': [script_content],
            'parameters': [
                {'name': "duration", 'value': duration},
                {'name': "size", 'value': size},
                {'name': "path", 'value': fill_path}
            ]
        }

        logger.debug("Filling disk of machine: {}".format(name))
        poller = client.virtual_machines.run_command(group, name, parameters)
        result = poller.result(duration + timeout)  # Blocking till executed
        logger.debug("Execution result: {}".format(poller))
        if result:
            logger.debug(result.value[0].message)  # stdout/stderr
        else:
            raise FailedActivity(
                "fill_disk operation did not finish on time. "
                "You may consider increasing timeout setting.")
Code Example #10
def modify_instance_groups_shrink_policy(
    cluster_id: str,
    group_id: str,
    decommission_timeout: int = None,
    terminate_instances: List[str] = None,
    protect_instances: List[str] = None,
    termination_timeout: int = None,
    configuration: Configuration = None,
    secrets: Secrets = None,
) -> AWSResponse:
    """Modify an instance groups shrink operations

    :param cluster_id: The cluster id
    :param group_id: The instance group id
    :param decommission_timeout: Timeout for decommissioning an instance
    :param terminate_instances: Instance id list to terminate when shrinking
    :param protect_instances: Instance id list to protect when shrinking
    :param termination_timeout: Override for list of instances to terminate
    :param configuration: access values used by actions/probes
    :param secrets: values that need to be passed on to actions/probes
    :return: Dict[str, Any]
    """
    if not any([decommission_timeout, terminate_instances, protect_instances]):
        raise FailedActivity("Must provide at least one of ["
                             '"decommission_timeout", "terminate_instances",'
                             '"protect_instances"]')

    if termination_timeout and not terminate_instances:
        raise FailedActivity('Must provide "terminate_instances" when '
                             'specifying "termination_timeout"')

    resize_policy = {
        **({
            "InstancesToTerminate": terminate_instances
        } if terminate_instances else {}),
        **({
            "InstancesToProtect": protect_instances
        } if protect_instances else {}),
        **({
            "InstanceTerminationTimeout": termination_timeout
        } if termination_timeout else {}),
    }

    params = {
        "ClusterId": cluster_id,
        "InstanceGroups": [{
            "InstanceGroupId": group_id,
            "ShrinkPolicy": {
                **({
                    "DecommissionTimeout": decommission_timeout
                } if decommission_timeout else {}),
                **({
                    "InstanceResizePolicy": resize_policy
                } if resize_policy else {}),
            },
        }],
    }

    client = aws_client("emr", configuration, secrets)

    try:
        client.modify_instance_groups(**params)
        return get_instance_group(client, cluster_id, group_id)
    except ClientError as e:
        logger.exception(e.response["Error"]["Message"])
        raise FailedActivity(e.response["Error"]["Message"])
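A hypothetical invocation of the action above; the cluster id, instance group id, instance id and region are placeholders.

```python
# Shrink an EMR instance group while steering which instance gets terminated.
response = modify_instance_groups_shrink_policy(
    cluster_id="j-ABCDEF123456",                  # placeholder cluster id
    group_id="ig-XYZ987",                         # placeholder group id
    decommission_timeout=300,                     # seconds before forced decommission
    terminate_instances=["i-0123456789abcdef0"],  # placeholder instance id
    configuration={"aws_region": "us-east-1"},
)
print(response)  # the refreshed instance group description
```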
Code Example #11
File: actions.py Project: xpdable/chaostoolkit-aws
def __linux_from_default(instance_id: str = None,
                         action: str = None,
                         parameters: Dict[str, Any] = None,
                         configuration: Configuration = None,
                         secrets: Secrets = None) -> AWSResponse:

    default_timeout = int(parameters['duration'])
    client = aws_client("ssm", configuration, secrets)
    if not instance_id:
        raise FailedActivity("you must specify the instance_id")
    try:
        if describe_os_type(instance_id, configuration, secrets) == "windows":
            os_type = OS_WINDOWS
        else:
            os_type = OS_LINUX

        res_send_command = client.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            # ===============================================
            # TODO if in Windows
            # DocumentName == 'AWS-RunPowerShellScript'
            # ===============================================
            Parameters={
                'commands':
                [construct_script_content(action, os_type, parameters)]
            },
        )
        cmd_id = res_send_command["Command"]["CommandId"]
        logger.info("ssm run command is sent, id {}".format(cmd_id))
        totalwait = 0
        interval = default_timeout / 2
        while True:
            res_list = client.list_command_invocations(CommandId=cmd_id,
                                                       Details=True)
            try:
                cp = res_list['CommandInvocations'][0]['CommandPlugins'][0]
                status = cp['Status']
                if status == "InProgress":
                    time.sleep(interval)
                    totalwait += interval
                    interval = interval / 2 if interval > 1 else 1
                    if totalwait > default_timeout + SSMDEFAULTNETWORKLAGACY:
                        raise FailedActivity(
                            "Script exceeded default timeout {}".format(
                                default_timeout))
                    continue
                elif status == "Failed":
                    break
                elif status == "Success":
                    break
                else:
                    break
            except IndexError:
                time.sleep(1)
                continue
        for command_invocation in res_list['CommandInvocations']:
            for invocation in command_invocation['CommandPlugins']:
                if invocation['Name'] == 'aws:runShellScript':
                    logger.info("ssm run command status {}".format(
                        invocation['Status']))
                    logger.info("ssm rum command result \n{}".format(
                        invocation['Output'].rstrip('\n')))
                    return invocation['Output'].rstrip('\n')
    except Exception as x:
        raise FailedActivity(
            "failed issuing a execute of shell script:\n{}".format(x))
Code Example #12
def describe_emr_cluster(client: boto3.client, cluster_id: str) -> AWSResponse:
    try:
        return client.describe_cluster(ClusterId=cluster_id)
    except ClientError as e:
        logger.exception(e.response["Error"]["Message"])
        raise FailedActivity(e.response["Error"]["Message"])
Code Example #13
def desired_equals_healthy_tags(tags: List[Dict[str, str]],
                                configuration: Configuration = None,
                                secrets: Secrets = None) -> AWSResponse:
    """
    If desired number matches the number of healthy instances

    for each of the auto-scaling groups matching tags provided

    `tags` are  expected as:
    [{
        'Key': 'KeyName',
        'Value': 'KeyValue'
    },
    ...
    ]

    Returns: bool
    """

    if not tags:
        raise FailedActivity("Non-empty tags is required")

    client = aws_client('autoscaling', configuration, secrets)

    # The following is needed because AWS API does not support filters
    # on auto-scaling groups

    # fetch all ASGs using paginator
    page_iterator = client.get_paginator(
        'describe_auto_scaling_groups').paginate(
            PaginationConfig={'PageSize': 100})
    asg_descrs = {'AutoScalingGroups': []}

    for page in page_iterator:
        asg_descrs['AutoScalingGroups'].extend(page['AutoScalingGroups'])

    filter_set = set(map(lambda x: "=".join([x['Key'], x['Value']]), tags))

    group_sets = list(
        map(
            lambda g: {
                'Name': g['AutoScalingGroupName'],
                'Tags': set(
                    map(lambda t: "=".join([t['Key'], t['Value']]),
                        g['Tags']))
            }, asg_descrs['AutoScalingGroups']))

    filtered_groups = [
        g['Name'] for g in group_sets if filter_set.issubset(g['Tags'])
    ]

    logger.debug("filtered groups: {}".format(filtered_groups))

    if filtered_groups:
        groups_descr = client.describe_auto_scaling_groups(
            AutoScalingGroupNames=filtered_groups)
    else:
        raise FailedActivity(
            "No auto-scaling groups matched the tags provided")

    return is_desired_equals_healthy(groups_descr)
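An illustrative probe call; the tag and region values are placeholders.

```python
# True when every ASG carrying the tag has as many healthy instances as desired.
ok = desired_equals_healthy_tags(
    tags=[{"Key": "Environment", "Value": "production"}],  # placeholder tag
    configuration={"aws_region": "eu-west-1"},
)
print(ok)
```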
Code Example #14
def run_experiment(experiment: Experiment,
                   settings: Settings = None) -> Journal:
    """
    Run the given `experiment` method step by step, in the following sequence:
    steady probe, action, close probe.

    Activities can be executed in background when they have the
    `"background"` property set to `true`. In that case, the activity is run in
    a thread. By the end of runs, those threads block until they are all
    complete.

    If the experiment has the `"dry"` property set to `True`, the experiment
    runs without actually executing the activities.

    NOTE: It is tricky to decide whether we should rollback when exiting
    abnormally (Ctrl-C, SIGTERM...). After all, there is a chance we actually
    cannot afford to rollback properly. Better to bail out with a conservative
    approach. This means we swallow :exc:`KeyboardInterrupt` and
    :exc:`SystemExit` and do not bubble them back up to the caller. When we
    were interrupted, we set the `interrupted` flag of the result accordingly
    to notify the caller this was indeed not terminated properly.
    """
    logger.info("Running experiment: {t}".format(t=experiment["title"]))

    dry = experiment.get("dry", False)
    if dry:
        logger.warning("Dry mode enabled")

    started_at = time.time()
    config = load_configuration(experiment.get("configuration", {}))
    secrets = load_secrets(experiment.get("secrets", {}), config)
    activity_pool, rollback_pool = get_background_pools(experiment)

    journal = initialize_run_journal(experiment)

    try:
        # this may fail the entire experiment right there if any of the probes
        # fail or fall out of their tolerance zone
        try:
            state = run_steady_state_hypothesis(experiment, config, secrets,
                                                dry)
            journal["steady_states"]["before"] = state
            if state is not None and not state["steady_state_met"]:
                p = state["probes"][-1]
                raise FailedActivity(
                    "Steady state probe '{p}' is not in the given tolerance "
                    "so failing this experiment".format(
                        p=p["activity"]["name"]))
        except FailedActivity as a:
            journal["steady_states"]["before"] = state
            journal["status"] = "failed"
            logger.fatal(str(a))
        else:
            try:
                journal["run"] = apply_activities(experiment, config, secrets,
                                                  activity_pool, dry)
            except Exception as x:
                journal["status"] = "aborted"
                logger.fatal(
                    "Experiment ran into an un expected fatal error, "
                    "aborting now.",
                    exc_info=True)
            else:
                try:
                    state = run_steady_state_hypothesis(
                        experiment, config, secrets, dry)
                    journal["steady_states"]["after"] = state
                    if state is not None and not state["steady_state_met"]:
                        p = state["probes"][-1]
                        raise FailedActivity(
                            "Steady state probe '{p}' is not in the given "
                            "tolerance so failing this experiment".format(
                                p=p["activity"]["name"]))
                except FailedActivity as a:
                    journal["status"] = "failed"
                    logger.fatal(str(a))
    except (KeyboardInterrupt, SystemExit):
        journal["status"] = "interrupted"
        logger.warn("Received an exit signal, "
                    "leaving without applying rollbacks.")
    else:
        journal["status"] = journal["status"] or "completed"
        journal["rollbacks"] = apply_rollbacks(experiment, config, secrets,
                                               rollback_pool, dry)

    journal["end"] = datetime.utcnow().isoformat()
    journal["duration"] = time.time() - started_at

    logger.info(
        "Experiment ended with status: {s}".format(s=journal["status"]))

    return journal
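A minimal sketch of driving the runner above with a hand-built experiment mapping rather than one loaded from disk; the titles are placeholders and the empty hypothesis, method and rollbacks only keep the example short.

```python
# Skeleton experiment; a real one would declare probes, actions and rollbacks.
experiment = {
    "title": "Service stays healthy while chaos is applied",
    "description": "Placeholder experiment for illustration.",
    "steady-state-hypothesis": {"title": "Service responds", "probes": []},
    "method": [],
    "rollbacks": [],
}

journal = run_experiment(experiment)
print(journal["status"], journal["duration"])
```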
Code Example #15
File: actions.py Project: pravarag/chaostoolkit-aws
def terminate_random_instances(asg_names: List[str] = None,
                               tags: List[Dict[str, str]] = None,
                               instance_count: int = None,
                               instance_percent: int = None,
                               az: str = None,
                               configuration: Configuration = None,
                               secrets: Secrets = None) -> List[AWSResponse]:
    """
    Terminates one or more random healthy instances associated with an ASG.

    A healthy instance is considered one with a lifecycle state of 'InService'.

    Parameters:
            One Of:
                - asg_names: a list of one or more asg names to target
                - tags: a list of key/value pairs to identify the asgs by

            One Of:
                - instance_count: the number of instances to terminate
                - instance_percent: the percentage of instances to terminate
                - az: the availability zone to terminate instances

    `tags` are expected as a list of dictionary objects:
    [
        {'Key': 'TagKey1', 'Value': 'TagValue1'},
        {'Key': 'TagKey2', 'Value': 'TagValue2'},
        ...
    ]
    """
    validate_asgs(asg_names, tags)

    if not any([instance_count, instance_percent, az]) or all(
        [instance_percent, instance_count, az]):
        raise FailedActivity(
            'Must specify one of "instance_count", "instance_percent", "az"')

    client = aws_client('autoscaling', configuration, secrets)

    if asg_names:
        asgs = get_asg_by_name(asg_names, client)
    else:
        asgs = get_asg_by_tags(tags, client)

    results = []
    for a in asgs['AutoScalingGroups']:
        # Filter out all instances not currently 'InService'
        instances = [
            e for e in a['Instances'] if e['LifecycleState'] == 'InService'
        ]

        if az:
            instances = [e for e in instances if e['AvailabilityZone'] == az]

            if not instances:
                raise FailedActivity(
                    'No instances found in Availability Zone: {}'.format(az))
        else:
            if instance_percent:
                instance_count = int(
                    float(len(instances) * float(instance_percent)) / 100)

            if len(instances) < instance_count:
                raise FailedActivity(
                    'Not enough healthy instances in {} to satisfy '
                    'termination count {} ({})'.format(
                        a['AutoScalingGroupName'], instance_count,
                        len(instances)))

            instances = random.sample(instances, instance_count)

        client = aws_client('ec2', configuration, secrets)
        try:
            response = client.terminate_instances(
                InstanceIds=sorted([e['InstanceId'] for e in instances]))
            results.append({
                'AutoScalingGroupName': a['AutoScalingGroupName'],
                'TerminatingInstances': response['TerminatingInstances']
            })
        except ClientError as e:
            raise FailedActivity(e.response['Error']['Message'])
    return results
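A hypothetical call targeting one availability zone; the ASG name, zone and region are placeholders. Note that when `az` is given, the code above terminates every 'InService' instance in that zone rather than a random sample.

```python
results = terminate_random_instances(
    asg_names=["my-asg"],                       # placeholder ASG name
    az="us-east-1a",                            # every InService instance here
    configuration={"aws_region": "us-east-1"},
)
for entry in results:
    print(entry["AutoScalingGroupName"], entry["TerminatingInstances"])
```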
Code Example #16
def sf_auth(configuration: Configuration,
            secrets: Secrets) -> ServiceFabricAuth:
    """
    Attempt to load the Service Fabric authentication information from a local
    configuration file or the passed `configuration` mapping. The latter takes
    precedence over the local configuration file.

    If you provide a configuration and secrets dictionary, the returned mapping
    will be created from their content. For instance, you could have:

    Configuration mapping (in your experiment file):
    ```python
    {
        "endpoint": "https://XYZ.westus.cloudapp.azure.com:19080",
        "verify_tls": False,
        "use_ca": False
    }
    ```

    Secrets mapping (in your experiment file):
    ```python
    {
        "azure": {
            "security": "pem",
            "pem_content": {
                "type": "env",
                "key": "AZURE_SERVICE_FABRIC_PEM"
            }
        }
    }
    ```

    In that case, the PEM content will be read from the local environment
    variable `AZURE_SERVICE_FABRIC_PEM` that you will have populated before
    hand. The content will be saved by the extension into a temporary file
    before being used to authenticate.

    You could also simply have that file ready instead:

    Secrets mapping (in your experiment file):
    ```python
    {
        "azure": {
            "security": "pem",
            "pem_path": "./party-cluster-XYZ-client-cert.pem"
        }
    }
    ```

    If you want to load the information from a local Service Fabric
    config file, set the `config_path` key in the `configuration` mapping.

    Configuration mapping (in your experiment file):
    ```python
    {
        "config_path": "~/.sfctl/config"
    }
    ```
    The path will be expanded.

    The authentication file should look like this:

    ```ini
    [servicefabric]
    endpoint = https://XYZ.westus.cloudapp.azure.com:19080
    no_verify = true
    use_ca = false
    security = pem
    pem_path = ./party-cluster-XYZ-client-cert.pem
    ```

    No matter the input, the yielded dictionary looks like this:

    ```python
    {
        "endpoint": "https://XYZ.westus.cloudapp.azure.com:19080",
        "verify": False,
        "security": {
            "type": "pem",
            "path": "./party-cluster-XYZ-client-cert.pem"
        }
    }
    ```

    Using this function goes as follows:

    ```python
    with auth(configuration, secrets) as info:
        url = "{}{}".format(
            info["endpoint"], "/Tools/Chaos/$/Start?api-version=6.0")

        r = requests.get(
            url, cert=info["security"]["path"], verify=info["verify"])
    ```
    """
    c = configuration or {}
    s = secrets or {}

    config_path = c.get("config_path")
    endpoint = c.get("endpoint", s.get("endpoint"))

    if config_path:
        config_path = os.path.expanduser(config_path)
        if not os.path.exists(config_path):
            raise FailedActivity(
                "Service Fabric configuration file not found at {}".format(
                    config_path))

        with open(config_path) as f:
            parser = configparser.ConfigParser()
            parser.read_file(f)

            pem_path = parser.get("servicefabric", "pem_path")
            if not pem_path:
                raise FailedActivity("cannot find {}".format(pem_path))

            yield {
                "endpoint": parser.get("servicefabric", "endpoint"),
                "verify":
                not (parser.get("servicefabric", "no_verify") != "true"),
                "security": {
                    "type": parser.get("servicefabric", "security"),
                    "path": pem_path
                }
            }

    elif endpoint:
        verify_tls = c.get("verify_tls", s.get("verify_tls", True))
        use_ca = c.get("use_ca", s.get("use_ca", True))
        security_kind = s.get("security", c.get("security", "pem"))
        pem_path = s.get("pem_path", c.get("pem_path", None))
        pem_content = s.get("pem_content", c.get("pem_content", None))

        info = {
            "endpoint": endpoint,
            "verify": verify_tls,
            "security": {
                "type": security_kind,
                "path": pem_path
            }
        }

        if not pem_path or (not os.path.exists(pem_path) and pem_content):
            # the file will be deleted when we leave the context block
            with tempfile.NamedTemporaryFile(mode="w+",
                                             encoding='utf-8') as pem_path:

                pem_path.write(pem_content)
                pem_path.seek(0)

                info["security"]["pem_path"] = pem_path.name
                yield info
        else:
            yield info
    else:
        raise FailedActivity(
            "Service Fabric client needs to know how to authenticate")
Code Example #17
File: command.py Project: mkaszub/chaostoolkit-azure
def __get_os_type(machine):
    os_type = machine['osType']
    if os_type not in (OS_LINUX, OS_WINDOWS):
        raise FailedActivity("Unknown OS Type: %s" % os_type)

    return os_type
Code Example #18
def __check_secrets(client_id, client_secret, tenant_id):
    if not client_id or not client_secret or not tenant_id:
        raise FailedActivity("Client could not find Azure credentials")
Code Example #19
def get_metric_statistics(
    namespace: str,
    metric_name: str,
    dimension_name: str = None,
    dimension_value: str = None,
    dimensions: List[Dict[str, str]] = None,
    duration: int = 60,
    offset: int = 0,
    statistic: str = None,
    extended_statistic: str = None,
    unit: str = None,
    configuration: Configuration = None,
    secrets: Secrets = None,
):
    """
    Get the value of a statistical calculation for a given metric.

    The period for which the calculation will be performed is specified by a duration and
    an offset from the current time. Both are specified in seconds.

    Example: A duration of 60 seconds and an offset of 30 seconds will yield a
    statistical value based on the time interval between 30 and 90 seconds in the past.

    One of the following is required:
        dimension_name, dimension_value: Required to search for ONE dimension
        dimensions: Required to search for combinations of dimensions.
        Dimensions are expected as a list of dictionary objects:
        [{'Name': 'Dim1', 'Value': 'Val1'}, {'Name': 'Dim2', 'Value': 'Val2'}, ...]

    More information about the input parameters is available in the documentation:
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/cloudwatch.html#CloudWatch.Client.get_metric_statistics
    """  # noqa: E501
    client = aws_client("cloudwatch", configuration, secrets)

    if statistic is None and extended_statistic is None:
        raise FailedActivity(
            "You must supply argument for statistic or extended_statistic")

    if dimensions is None and dimension_name is None and dimension_value is None:
        raise FailedActivity("You must supply argument for dimensions")

    end_time = datetime.utcnow() - timedelta(seconds=offset)
    start_time = end_time - timedelta(seconds=duration)
    request_kwargs = {
        "Namespace": namespace,
        "MetricName": metric_name,
        "StartTime": start_time,
        "EndTime": end_time,
        "Period": duration,
    }

    if dimensions is not None:
        request_kwargs["Dimensions"] = dimensions
    else:
        request_kwargs["Dimensions"] = [{
            "Name": dimension_name,
            "Value": dimension_value
        }]

    if statistic is not None:
        request_kwargs["Statistics"] = [statistic]
    if extended_statistic is not None:
        request_kwargs["ExtendedStatistics"] = [extended_statistic]
    if unit is not None:
        request_kwargs["Unit"] = unit

    logger.debug(f"Request arguments: {request_kwargs}")
    response = client.get_metric_statistics(**request_kwargs)

    datapoints = response["Datapoints"]
    if not datapoints:
        return 0

    datapoint = datapoints[0]
    logger.debug(f"Response: {response}")
    try:
        if statistic is not None:
            return datapoint[statistic]
        elif extended_statistic is not None:
            return datapoint["ExtendedStatistics"][extended_statistic]
    except Exception as x:
        raise FailedActivity(
            f"Unable to parse response '{response}': '{str(x)}'")
Code Example #20
@patch('chaoscf.api.get_app_by_name', autospec=True, return_value=responses.app)
@patch('chaoscf.api.auth', autospec=True)
def test_get_bind_by_name(auth, mock_get_app_by_name):
    auth.return_value = responses.auth_response
    with requests_mock.mock() as m:
        m.get(
            "https://example.com/v2/apps/" + responses.app["metadata"]["guid"] + "/service_bindings",
            status_code=200, json=responses.binds, complete_qs=True)

        get_bind_by_name("my-bind", config.config, secrets.secrets, app_name="my-app", org_name="my-org")

    mock_get_app_by_name.assert_has_calls([call("my-app", config.config,
                                                secrets.secrets, org_name="my-org", space_name=None)])


@patch('chaoscf.api.get_app_by_name', autospec=True, side_effect=FailedActivity("error"))
@patch('chaoscf.api.auth', autospec=True)
def test_get_bind_by_name_when_app_not_found(auth, mock_get_app_by_name):
    auth.return_value = responses.auth_response
    with requests_mock.mock() as m:
        m.get(
            "https://example.com/v2/apps/" + responses.app["metadata"]["guid"] + "/service_bindings",
            status_code=200, json=responses.binds, complete_qs=True)

        with pytest.raises(FailedActivity):
            get_bind_by_name("my-bind", config.config, secrets.secrets, app_name="my-app", org_name="my-org")

    mock_get_app_by_name.assert_has_calls([call("my-app", config.config,
                                                secrets.secrets, org_name="my-org", space_name=None)])

Code Example #21
def stress_cpu(filter: str = None,
               duration: int = 120,
               timeout: int = 60,
               configuration: Configuration = None,
               secrets: Secrets = None):
    """
    Stress CPU up to 100% at random machines.

    Parameters
    ----------
    filter : str, optional
        Filter the virtual machines. If the filter is omitted all machines in
        the subscription will be selected as potential chaos candidates.
    duration : int, optional
        Duration of the stress test (in seconds) that generates high CPU usage.
        Defaults to 120 seconds.
    timeout : int
        Additional wait time (in seconds) for stress operation to be completed.
        Getting and sending data from/to Azure may take some time so it's not
        recommended to set this value to less than 30s. Defaults to 60 seconds.

    Examples
    --------
    Some calling examples. Deep dive into the filter syntax:
    https://docs.microsoft.com/en-us/azure/kusto/query/

    >>> stress_cpu("where resourceGroup=='rg'", configuration=c, secrets=s)
    Stress all machines from the group 'rg'

    >>> stress_cpu("where resourceGroup=='rg' and name='name'",
                    configuration=c, secrets=s)
    Stress the machine from the group 'rg' having the name 'name'

    >>> stress_cpu("where resourceGroup=='rg' | sample 2",
                    configuration=c, secrets=s)
    Stress two machines at random from the group 'rg'
    """

    logger.debug("Start stress_cpu: configuration='{}', filter='{}'".format(
        configuration, filter))

    machines = __fetch_machines(filter, configuration, secrets)
    client = __compute_mgmt_client(secrets, configuration)

    for m in machines:
        name = m['name']
        group = m['resourceGroup']
        os_type = __get_os_type(m)
        if os_type == OS_WINDOWS:
            command_id = 'RunPowerShellScript'
            script_name = "cpu_stress_test.ps1"
        elif os_type == OS_LINUX:
            command_id = 'RunShellScript'
            script_name = "cpu_stress_test.sh"
        else:
            raise FailedActivity("Cannot run CPU stress test on OS: %s" %
                                 os_type)

        with open(
                os.path.join(os.path.dirname(__file__), "scripts",
                             script_name)) as file:
            script_content = file.read()

        parameters = {
            'command_id': command_id,
            'script': [script_content],
            'parameters': [{
                'name': "duration",
                'value': duration
            }]
        }

        logger.debug("Stressing CPU of machine: {}".format(name))
        poller = client.virtual_machines.run_command(group, name, parameters)
        result = poller.result(duration + timeout)  # Blocking till executed
        if result:
            logger.debug(result.value[0].message)  # stdout/stderr
        else:
            raise FailedActivity(
                "stress_cpu operation did not finish on time. "
                "You may consider increasing timeout setting.")
Code Example #22
def execute_activity(activity: Activity,
                     configuration: Configuration,
                     secrets: Secrets,
                     dry: bool = False) -> Run:
    """
    Low-level wrapper around the actual activity provider call to collect
    some meta data (like duration, start/end time, exceptions...) during
    the run.
    """
    ref = activity.get("ref")
    if ref:
        activity = lookup_activity(ref)
        if not activity:
            raise FailedActivity(
                "could not find referenced activity '{r}'".format(r=ref))

    pauses = activity.get("pauses", {})
    pause_before = pauses.get("before")
    if pause_before:
        logger.info(
            "Pausing before next activity for {d}s...".format(d=pause_before))
        time.sleep(pause_before)

    if activity.get("background"):
        logger.info("{t}: {n} [in background]".format(
            t=activity["type"].title(), n=activity.get("name")))
    else:
        logger.info("{t}: {n}".format(t=activity["type"].title(),
                                      n=activity.get("name")))

    start = datetime.utcnow()

    run = {"activity": activity.copy(), "output": None}

    result = None
    try:
        # only run the activity itself when not in dry-mode
        if not dry:
            result = run_activity(activity, configuration, secrets)
        run["output"] = result
        run["status"] = "succeeded"
        if result is not None:
            logger.debug("  => succeeded with '{r}'".format(r=result))
        else:
            logger.debug("  => succeeded without any result value")
    except FailedActivity as x:
        error_msg = str(x)
        run["status"] = "failed"
        run["output"] = result
        run["exception"] = traceback.format_exception(type(x), x, None)
        logger.error("  => failed: {x}".format(x=error_msg))
    finally:
        # capture the end time before we pause
        end = datetime.utcnow()
        run["start"] = start.isoformat()
        run["end"] = end.isoformat()
        run["duration"] = (end - start).total_seconds()

        pause_after = pauses.get("after")
        if pause_after:
            logger.info(
                "Pausing after activity for {d}s...".format(d=pause_after))
            time.sleep(pause_after)

    return run
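A sketch of running a single activity through the wrapper above in dry mode, so no provider is actually invoked; the provider module and function names are hypothetical.

```python
# Hypothetical probe activity; module/func below do not refer to a real package.
activity = {
    "type": "probe",
    "name": "check-service-health",
    "provider": {
        "type": "python",
        "module": "mypackage.probes",            # hypothetical module
        "func": "service_is_healthy",            # hypothetical function
        "arguments": {"url": "http://localhost:8080/health"},
    },
    "pauses": {"before": 1, "after": 2},
}

run = execute_activity(activity, configuration={}, secrets={}, dry=True)
print(run["status"], run["start"], run["duration"])
```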