Example #1
    def set_scaling_state(self,
                          instance_id,
                          value,
                          ttl=None,
                          meta=None,
                          default_date=None):
        if ttl is None: ttl = Cfg.get_duration_secs("ec2.state.default_ttl")
        if default_date is None: default_date = self.context["now"]
        #if value in ["draining"] and instance_id in ["i-0ed9bddf74dd2a2f5", "i-0904bbd267f736227"]: pdb.set_trace()

        meta = {} if meta is None else meta
        previous_value = self.get_scaling_state(instance_id,
                                                meta=meta,
                                                do_not_return_excluded=True)
        date = meta["last_action_date"] if previous_value == value else default_date
        self.set_state(
            "ec2.instance.scaling.last_action_date.%s" % instance_id, date,
            ttl)
        self.set_state(
            "ec2.instance.scaling.last_%s_date.%s" % (value, instance_id),
            date, ttl)
        previous_value = self.get_scaling_state(instance_id, meta=meta)
        return self.set_state("ec2.instance.scaling.state.%s" % instance_id,
                              value, ttl)
def next_call_delay():
    global ctx
    expected_delay = Cfg.get_duration_secs("app.run_period")
    last_call_delay = seconds_since_last_call()
    delta = expected_delay - last_call_delay
    if delta < 0:
        return expected_delay
    return max(int(delta), 0)
 def set_instance_state(self, instance_id, targetgroup_name, value):
     m = re.search("(.*)/([^/]+)/\w+$", targetgroup_name)
     targetgroup_name = m.group(2)
     key = "targetgroup.status.%s.%s" % (targetgroup_name, instance_id)
     self.ec2.set_state(
         key.replace(":", "_"),
         value,
         TTL=Cfg.get_duration_secs("targetgroup.default_state_ttl"))
    def update_pending_command_statuses(self):
        client = self.context["ssm.client"]
        self.run_cmd_states = self.o_state.get_state_json("ssm.events.run_commands", default={
            "Commands": [],
            "FormerResults": {}
            })

        former_results = self.run_cmd_states["FormerResults"]
        cmds           = self.run_cmd_states["Commands"]
        for cmd in cmds:
            command = cmd["Command"]
            args    = cmd["CommandArgs"]
            if "Complete" not in cmd:
                cmd_id            = cmd["Id"]
                paginator         = client.get_paginator('list_command_invocations')
                response_iterator = paginator.paginate(CommandId=cmd_id, Details=True, MaxResults=50)
                for response in response_iterator:
                    for invoc in response["CommandInvocations"]:
                        instance_id = invoc["InstanceId"]
                        status      = invoc["Status"]
                        if (status not in ["Success", "Cancelled", "Failed", "TimedOut", "Undeliverable", 
                                "Terminated", "Delivery Timed Out", "Execution Timed Out"]):
                            continue
                        stdout      = [s.rstrip() for s in io.StringIO(invoc["CommandPlugins"][0]["Output"]).readlines() 
                                if s.startswith("CLONESQUAD-SSM-AGENT-")]
                        bie_msg     = next(filter(lambda s: s.startswith("CLONESQUAD-SSM-AGENT-BIE:"), stdout), None)
                        if not bie_msg:
                            log.log(log.NOTICE, f"Truncated reply from SSM Command Invocation ({cmd_id}/{instance_id}). "
                                "*Cause: SSM exec error? started shell command too verbose? (please limit to 24kBytes max!)")
                        agent_status = "CLONESQUAD-SSM-AGENT-STATUS:"
                        status_msg  = next(filter(lambda s: s.startswith(agent_status), stdout), None)
                        if status_msg is None:
                            status_msg = "ERROR"
                        else:
                            status_msg = status_msg[len(agent_status):]
                        details_msg = list(filter(lambda s: s.startswith("CLONESQUAD-SSM-AGENT-DETAILS:"), stdout))
                        warning_msg = list(filter(lambda s: ":WARNING:" in s, stdout))
                        if len(warning_msg):
                            log.warning(f"Got warning while retrieving SSM RunCommand output for {cmd_id}/{instance_id}/{command}: "
                                    f"{warning_msg}/{details_msg}")

                        result = {
                            "SSMInvocationStatus": status,
                            "Status": status_msg,
                            "Truncated": bie_msg is None,
                            "Expiration": misc.seconds_from_epoch_utc() + Cfg.get_duration_secs("ssm.state.command.result.default_ttl")
                        }
                        # Keep track of the former result list
                        if instance_id not in former_results: former_results[instance_id] = {}
                        former_results[instance_id][f"{command};{args}"] = result
                        if instance_id not in cmd["ReceivedInstanceIds"]:
                            cmd["ReceivedInstanceIds"].append(instance_id)

                    if set(cmd["ReceivedInstanceIds"]) & set(cmd["InstanceIds"]) == set(cmd["InstanceIds"]):
                        # All invocation results received
                        cmd["Complete"] = True
        self.commands_to_send = []
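# --- Illustrative sketch (not part of the original class) ---
# The loop above relies on CLONESQUAD-SSM-AGENT-* marker lines emitted by the agent script.
# This standalone helper shows, on an invented sample output, how the BIE (end-of-output) and
# STATUS markers are extracted; the helper name and the sample string are assumptions.
import io

def _parse_agent_output(output):
    lines  = [s.rstrip() for s in io.StringIO(output).readlines()
              if s.startswith("CLONESQUAD-SSM-AGENT-")]
    bie    = next((s for s in lines if s.startswith("CLONESQUAD-SSM-AGENT-BIE:")), None)
    prefix = "CLONESQUAD-SSM-AGENT-STATUS:"
    status = next((s[len(prefix):] for s in lines if s.startswith(prefix)), "ERROR")
    return {"Status": status, "Truncated": bie is None}

_sample = "some shell noise\nCLONESQUAD-SSM-AGENT-STATUS:SUCCESS\nCLONESQUAD-SSM-AGENT-BIE:\n"
assert _parse_agent_output(_sample) == {"Status": "SUCCESS", "Truncated": False}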
    def is_maintenance_time(self, fleet=None, meta=None):
        if not self.is_feature_enabled("maintenance_window"):
            return False
        now         = self.context["now"]
        sa          = max(Cfg.get_duration_secs("ssm.feature.maintenance_window.start_ahead"), 30)
        # We compute a predictive jitter to avoid all subfleets starting exactly at the same time (see the sketch after this method)
        group_name  = self.context["GroupName"]
        jitter_salt = int(misc.sha256(f"{group_name}:{fleet}")[:3], 16) / (16 * 16 * 16) * sa
        jitter      = Cfg.get_abs_or_percent("ssm.feature.maintenance_window.start_ahead.max_jitter", 0, jitter_salt)

        start_ahead = timedelta(seconds=(sa-jitter))
        windows     = copy.deepcopy(self._get_maintenance_windows_for_fleet(fleet=fleet))
        for w in windows:
            window_id= w["WindowId"]
            if "NextExecutionTime" in w:
                end_time = w["NextExecutionTime"] + timedelta(hours=int(w["Duration"]))
                if now >= (w["NextExecutionTime"] - start_ahead) and now < end_time:
                    # We are entering a new maintenance window period. Remember it...
                    self.o_state.set_state(f"ssm.events.maintenance_window.last_next_execution_time.{window_id}", 
                        w["NextExecutionTime"], TTL=self.ttl)
                    self.o_state.set_state(f"ssm.events.maintenance_window.last_next_execution_duration.{window_id}", 
                        w["Duration"], TTL=self.ttl)
                w["_FutureNextExecutionTime"] = w["NextExecutionTime"]
            # SSM maintenance windows do not always have a NextExecutionTime field -=OR=- it contains the future
            #  NextExecutionTime of the next iteration. In both cases, we restore it from a backed-up one.
            next_execution_time = self.o_state.get_state_date(f"ssm.events.maintenance_window.last_next_execution_time.{window_id}", TTL=self.ttl)
            if next_execution_time is not None:
                w["NextExecutionTime"] = next_execution_time
            if "Duration" not in w:
                next_execution_duration = self.o_state.get_state(f"ssm.events.maintenance_window.last_next_execution_duration.{window_id}", TTL=self.ttl)
                if next_execution_duration is not None:
                    w["Duration"] = next_execution_duration

        valid_windows = [w for w in windows if "NextExecutionTime" in w and "Duration" in w]
        fleetname     = "Main" if fleet is None else fleet
        next_window   = None
        for w in sorted(valid_windows, key=lambda w: w["NextExecutionTime"]):
            end_time   = w["NextExecutionTime"] + timedelta(hours=int(w["Duration"]))
            start_time = w["NextExecutionTime"] - start_ahead
            if now >= start_time and now < end_time:
                if meta is not None: 
                    meta["MatchingWindow"] = w
                    meta["MatchingWindowMessage"] = f"Found ACTIVE matching window for fleet {fleetname} : {w}"
                    meta["StartTime"] = w["NextExecutionTime"]
                    meta["EndTime"] = end_time
                return True
            if ("_FutureNextExecutionTime" in w and w["_FutureNextExecutionTime"] > now and 
                    (next_window is None or w["_FutureNextExecutionTime"] < next_window["_FutureNextExecutionTime"])):
                next_window     = w
        if next_window is not None and meta is not None:
            meta["NextWindowMessage"] = (f"Next SSM Maintenance Window for {fleetname} fleet is "
                f"'{next_window['WindowId']}/{next_window['Name']}' in {next_window['_FutureNextExecutionTime'] - now} "
                f"(Fleet will start ahead at {next_window['_FutureNextExecutionTime'] - start_ahead}).")
        return False
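# --- Illustrative sketch (not part of the original class) ---
# The predictive jitter above spreads subfleet start-ahead times deterministically: the same
# group/fleet pair always gets the same offset within [0, start_ahead). hashlib is used here
# in place of the project's misc.sha256() helper; group and fleet names are invented.
import hashlib

def _predictive_jitter(group_name, fleet, start_ahead_secs):
    digest = hashlib.sha256(f"{group_name}:{fleet}".encode()).hexdigest()
    # First 3 hex digits -> stable ratio in [0.0, 1.0), scaled to the start-ahead period.
    return int(digest[:3], 16) / (16 * 16 * 16) * start_ahead_secs

print(_predictive_jitter("mygroup", "frontend", 300))  # stable, fleet-specific offset
print(_predictive_jitter("mygroup", "backend", 300))   # a different, but equally stable, offset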
def is_called_too_early():
    global ctx
    delay = Cfg.get_duration_secs("app.run_period")
    delta = sqs.seconds_since_last_call()
    if delta != -1 and delta < delay:
        if misc.is_sam_local():
            log.warning("is_called_too_early disabled because running in SAM!")
            return False
        log.log(log.NOTICE, "Called too early (now=%s, delay=%s => delta_seconds=%s)..." %
                (ctx["now"], delay, delta)) 
        return True
    return False
Example #7
    def configure_dashboard(self):
        client = self.context["cloudwatch.client"]
        # CloudWatch bills calls to the dashboard API, so we make sure we do not call it too often (see the sketch after this method)
        now = self.context["now"]
        dashboard_state = Cfg.get_int("cloudwatch.dashboard.use_default")
        dashboard_last_state = self.ec2.get_state(
            "cloudwatch.dashboard.use_default.last_state")
        self.ec2.set_state("cloudwatch.dashboard.use_default.last_state",
                           dashboard_state,
                           TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))

        last_dashboard_action = self.ec2.get_state_date(
            "cloudwatch.dashboard.last_action", default=misc.epoch())
        dashboard_update_interval = Cfg.get_duration_secs(
            "cloudwatch.dashboard.update_interval")
        if (str(dashboard_state) == dashboard_last_state and
                (now - last_dashboard_action).total_seconds() < dashboard_update_interval):
            log.debug("Not yet the time to manage the dashboard.")
            return

        if Cfg.get_int("cloudwatch.dashboard.use_default") != 1:
            try:
                client.delete_dashboards(
                    DashboardNames=[self._get_dashboard_name()])
            except Exception:
                pass
        else:
            content = self.load_dashboard()
            log.log(
                log.NOTICE, "Configuring CloudWatch dashboard '%s'..." %
                self._get_dashboard_name())

            response = client.put_dashboard(
                DashboardName=self._get_dashboard_name(),
                DashboardBody=content)
        self.ec2.set_state("cloudwatch.dashboard.last_action",
                           now,
                           TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
    def __init__(self, context, state, ec2, targetgroup, cloudwatch):
        global do_not_notify
        do_not_notify = False
        self.context = context
        self.ec2 = ec2
        self.targetgroup = targetgroup
        self.cloudwatch = cloudwatch
        self.state = state
        self.table_name = None
        Cfg.register({
            "notify.event.default_ttl": "minutes=5",
            "notify.event.longterm.max_records,Stable": {
                "DefaultValue":
                50,
                "Format":
                "Integer",
                "Description":
                """Maximum records to hold in the Event-LongTerm DynamodDB table

Setting this value to 0, disable logging to the LongTerm event table.
"""
            },
            "notify.event.longterm.ttl,Stable": {
                "DefaultValue":
                "days=5",
                "Format":
                "Duration",
                "Description":
                """Retention time for Long-Term DynamoDB entries.

This table is used for deep-dive analysis of noticeable events encountered by a CloneSquad deployment. It is mainly used to
improve CloneSquad over time by allowing easy sharing of essential data for remote debugging.
               """
            },
            "notify.event.keep_acked_records": "0",
            "notify.debug.obfuscate_s3_reports": "1",
            "notify.debug.send_s3_reports": "1"
        })
        self.state.register_aggregates([{
            "Prefix":
            "notify.",
            "Compress":
            True,
            "DefaultTTL":
            Cfg.get_duration_secs("notify.event.longterm.ttl"),
            "Exclude": []
        }])
        global notify_mgr
        notify_mgr = self
 def __init__(self, context, ec2):
     self.context = context
     self.ec2 = ec2
     self.state_changed = False
     self.prereqs_done = False
     Cfg.register({
         "targetgroup.debug.inject_fault_status": "",
         "targetgroup.default_state_ttl": "minutes=30",
         "targetgroup.slow_deregister_timeout": "minutes=2"
     })
     self.ec2.register_state_aggregates([{
         "Prefix":
         "targetgroup.status.",
         "Compress":
         True,
         "DefaultTTL":
         Cfg.get_duration_secs("targetgroup.default_state_ttl")
     }])
Example #10
    def get_prerequisites(self):
        ctx = self.context
        self.table = kvtable.KVTable.create(
            self.context,
            self.context["StateTable"],
            cache_max_age=Cfg.get_duration_secs("statemanager.cache.max_age"))
        for a in self.table_aggregates:
            self.table.register_aggregates(a)
        self.table.reread_table()

        # Retrieve all CloneSquad resources
        misc.initialize_clients(["resourcegroupstaggingapi"], self.context)
        tagging_client = self.context["resourcegroupstaggingapi.client"]
        paginator = tagging_client.get_paginator('get_resources')
        tag_mappings = itertools.chain.from_iterable(
            page['ResourceTagMappingList'] for page in paginator.paginate(
                TagFilters=[{
                    'Key': 'clonesquad:group-name',
                    'Values': [self.context["GroupName"]]
                }]))
        self.clonesquad_resources = list(tag_mappings)
Example #11
 def stop_instances(self, instance_ids_to_stop):
     now = self.context["now"]
     client = self.context["ec2.client"]
     for instance_id in instance_ids_to_stop:
         try:
             response = R(lambda args, kwargs, r: r["ResponseMetadata"][
                 "HTTPStatusCode"] == 200,
                          client.stop_instances,
                          InstanceIds=[instance_id])
             if response is not None and "StoppingInstances" in response:
                 for i in response["StoppingInstances"]:
                     instance_id = i["InstanceId"]
                     self.set_scaling_state(instance_id, "")
                     self.set_state(
                         "ec2.schedule.instance.last_stop_date.%s" %
                         instance_id,
                         now,
                         TTL=Cfg.get_duration_secs("ec2.state.status_ttl"))
             log.debug(response)
         except Exception as e:
             log.warning("Failed to stop_instance '%s' : %s" %
                         (instance_id, e))
Example #12
    def configure_alarms(self):
        """ Configure Cloudwatch Alarms for each instance.

            The algorithm needs to manage missing alarms as well as update existing alarms
        """
        now = self.context["now"]
        client = self.context["cloudwatch.client"]

        valid_alarms = []
        nb_of_updated_alarms = 0
        max_update_per_batch = Cfg.get_int(
            "cloudwatch.metrics.max_update_per_batch")

        log.log(
            log.NOTICE,
            "Found following Alarm definition key(s) in configuration: %s" %
            [d for d in self.alarm_definitions])

        # Step 1) Create or Update CloudWatch Alarms for running instances
        for instance in self.ec2.get_instances(
                State="pending,running",
                ScalingState="-error,draining,excluded"):
            instance_id = instance["InstanceId"]

            age_secs = (now - instance["LaunchTime"]).total_seconds()
            min_instance_age = Cfg.get_duration_secs(
                "cloudwatch.alarms.min_instance_age")
            if age_secs < min_instance_age:
                log.log(
                    log.NOTICE,
                    "Instance '%s' too young. Wait %d seconds before to set an alarm..."
                    % (instance_id, min_instance_age - age_secs))
                continue

            # Update alarms for this instance
            for alarm_definition in self.alarm_definitions:
                # First, check if an alarm already exists
                alarm_name = self._get_alarm_name(self.context["GroupName"],
                                                  instance["InstanceId"],
                                                  int(alarm_definition))
                existing_alarms = list(
                    filter(lambda x: x['AlarmName'] == alarm_name,
                           self.alarms))

                # Load alarm template
                try:
                    if "Content" not in self.alarm_definitions[
                            alarm_definition]:
                        continue
                    kwargs = self.context.copy()
                    kwargs["InstanceId"] = instance_id
                    alarm_template = self.alarm_definitions[alarm_definition][
                        "Content"].format(**kwargs)
                    alarm = yaml.safe_load(alarm_template)
                except Exception as e:
                    log.exception(
                        "[ERROR] Failed to load YAML alarm template for definition '%s' : %s" %
                        (alarm_definition, e))
                    continue
                alarm["AlarmName"] = alarm_name

                valid_alarms.append(alarm_name)

                # Check if an alarm already exists
                existing_alarm = None
                if len(existing_alarms) > 0:
                    existing_alarm = existing_alarms[0]

                    # Check if alarm definition will be the same
                    a = {**existing_alarm, **alarm}
                    # 2020/07/20: CloudWatch Alarm API does not return Tags. We have to deal with
                    #  that while comparing the configurations (see the sketch after this method).
                    if "Tags" in a and "Tags" not in existing_alarm:
                        del a["Tags"]
                    if a == existing_alarm:
                        #log.debug("Not updating alarm '%s' as configuration is already ok" % alarm_name)
                        continue

                    # Check if we updated this alarm very recently
                    delta = datetime.now(
                        timezone.utc
                    ) - existing_alarm["AlarmConfigurationUpdatedTimestamp"]
                    if delta < timedelta(minutes=1):
                        log.debug("Alarm '%s' updated to soon" % alarm_name)
                        continue

                nb_of_updated_alarms += 1
                if nb_of_updated_alarms > max_update_per_batch: break

                log.log(
                    log.NOTICE,
                    "Updating/creating CloudWatch Alarm '%s' : %s" %
                    (alarm_name, alarm))
                resp = client.put_metric_alarm(**alarm)
                log.debug(Dbg.pprint(resp))

        # Step 2) Destroy CloudWatch Alarms for non existing instances (Garbage Collection)
        for existing_alarm in self.alarms:
            alarm_name = existing_alarm["AlarmName"]
            if not alarm_name.startswith("CloneSquad-%s-i-" %
                                         (self.context["GroupName"])):
                continue
            if alarm_name not in valid_alarms:
                nb_of_updated_alarms += 1
                if nb_of_updated_alarms > max_update_per_batch: break
                log.debug("Garbage collection orphan Cloudwatch Alarm '%s'" %
                          alarm_name)
                resp = client.delete_alarms(AlarmNames=[alarm_name])
                log.debug(resp)
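# --- Illustrative sketch (not part of the original class) ---
# The drift detection above merges the desired alarm definition on top of the existing one:
# if the merge changes nothing, no put_metric_alarm() call is needed. Values are invented.
existing_alarm = {"AlarmName": "CloneSquad-demo-i-0123", "Threshold": 75.0, "Period": 60}
desired_alarm  = {"AlarmName": "CloneSquad-demo-i-0123", "Threshold": 75.0, "Period": 60}
assert {**existing_alarm, **desired_alarm} == existing_alarm   # identical -> skip update

desired_alarm["Threshold"] = 80.0
assert {**existing_alarm, **desired_alarm} != existing_alarm   # drift -> update the alarm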
Example #13
    def __init__(self, context, ec2):
        self.context = context
        self.ec2 = ec2
        self.alarms = None
        self.metrics = []
        Cfg.register({
            "cloudwatch.describe_alarms.max_results": "50",
            "cloudwatch.default_ttl": "days=1",
            "cloudwatch.alarms.max_per_instance": "6",
            "cloudwatch.alarms.min_instance_age": "minutes=3",
            "cloudwatch.configure.max_alarms_deleted_batch_size": "5",
            "cloudwatch.metrics.namespace": "CloneSquad",
            "cloudwatch.metrics.subnamespace": "",
            "cloudwatch.metrics.excluded,Stable": {
                "DefaultValue":
                "",
                "Format":
                "StringList",
                "Description":
                """List of metric pattern names to not send to Cloudwatch

This configuration key is used to do Cost optimization by filtering which CloneSquad Metrics are sent to Cloudwatch.
It support regex patterns.

> Ex: StaticFleet.*;NbOfBouncedInstances

                        """
            },
            "cloudwatch.metrics.data_period": "minutes=2",
            "cloudwatch.metrics.max_update_per_batch": "20",
            "cloudwatch.metrics.cache.max_retention_period": "minutes=10",
            "cloudwatch.metrics.minimum_polled_alarms_per_run": "1",
            "cloudwatch.metrics.time_for_full_metric_refresh,Stable": {
                "DefaultValue":
                "minutes=1,seconds=30",
                "Format":
                "Duration",
                "Description":
                """The total period for a complete refresh of EC2 Instance metrics

This parameter is a way to reduce the CloudWatch cost induced by GetMetricData API calls. It indirectly defines how many alarm metrics
will be polled in a single Main Lambda execution. A dedicated algorithm is used to extrapolate missing data based
on previous GetMetricData API calls.

Reducing this value increases the accuracy of the scaling criteria and so the reactivity of CloneSquad to a sudden burst of activity load, but at the
expense of CloudWatch.GetMetricData API costs.

This parameter does not influence the polling of user-supplied alarms, which are always polled at each run.
                        """
            },
            "cloudwatch.dashboard.use_default,Stable": {
                "DefaultValue":
                1,
                "Format":
                "Bool",
                "Description":
                """Enable or disable the Cloudwatch dashboard for CloneSquad.

The dashboard is enabled by default.
                        """
            },
            "cloudwatch.dashboard.update_interval": "hours=1",
            "cloudwatch.dashboard.snapshot_width": 1000,
            "cloudwatch.dashboard.snapshot_height": 400
        })
        Cfg.register({
            "cloudwatch.alarm00.configuration_url,Stable": {
                "DefaultValue":
                "",
                "Format":
                "MetaString",
                "Description":
                """Alarm specification to track for scaling decisions.

    Ex: internal:ec2.scaleup.alarm-cpu-gt-75pc.yaml,Points=1001,BaselineThreshold=30.0

See [Alarm specification documentation](ALARMS_REFERENCE.md)  for more details.
            """
            }
        })
        for i in range(1, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
            Cfg.register({
                "cloudwatch.alarm%02d.configuration_url,Stable" % i: {
                    "DefaultValue":
                    "",
                    "Format":
                    "MetaString",
                    "Description":
                    """See `cloudwatch.alarm00.configuration_url`.
                """
                }
            })
        self.register_metric([{
            "MetricName": "Cloudwatch.GetMetricData",
            "Unit": "Count",
            "StorageResolution": 60
        }])

        self.ec2.register_state_aggregates([{
            "Prefix":
            "cloudwatch.dashboard.",
            "Compress":
            True,
            "DefaultTTL":
            Cfg.get_duration_secs("cloudwatch.default_ttl"),
            "Exclude": []
        }])
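# --- Illustrative sketch (not part of the original class) ---
# One plausible reading of the 'cloudwatch.metrics.excluded' key documented above: each
# ';'-separated item is a regex matched against the metric name. The helper and samples are
# assumptions for illustration, not the project's actual filtering code.
import re

def _is_metric_excluded(metric_name, excluded_spec):
    patterns = [p for p in excluded_spec.split(";") if p != ""]
    return any(re.match(p, metric_name) for p in patterns)

assert _is_metric_excluded("StaticFleet.Size", "StaticFleet.*;NbOfBouncedInstances")
assert not _is_metric_excluded("NbOfInstances", "StaticFleet.*;NbOfBouncedInstances")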
Example #14
    def get_prerequisites(self):
        now = self.context["now"]
        client = self.context["cloudwatch.client"]

        # Read all CloudWatch alarm templates into memory
        alarm_definitions = {}
        for i in range(0, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
            key = "cloudwatch.alarm%02d.configuration_url" % (i)
            r = Cfg.get_extended(key)
            if not r["Success"] or r["Value"] == "":
                continue

            d = misc.parse_line_as_list_of_dict(r["Value"])
            url = d[0]["_"]
            meta = d[0]

            index = "%02d" % i
            alarm_defs = {
                "Index": index,
                "Key": key,
                "Url": url,
                "Definition": r,
                "Metadata": meta
            }

            prefix = "alarmname:"
            if url.startswith(prefix):
                alarm_defs["AlarmName"] = url[len(prefix):]
            else:
                log.log(log.NOTICE, "Read Alarm definition: %s" % r["Value"])
                try:
                    resp = misc.get_url(url.format(**self.context))
                    if resp is None:
                        raise Exception("URL content = <None>")
                    alarm_defs["Content"] = str(resp, "utf-8")
                except Exception as e:
                    log.exception("Failed to load Alarm definition '%s' : %e" %
                                  (r["Value"], e))
                    continue
            alarm_definitions[index] = alarm_defs

        self.alarm_definitions = alarm_definitions

        # Read all existing CloudWatch alarms
        alarms = []
        response = None
        while (response is None or "NextToken" in response):
            response = client.describe_alarms(MaxRecords=Cfg.get_int(
                "cloudwatch.describe_alarms.max_results"),
                                              NextToken=response["NextToken"]
                                              if response is not None else "")
            #log.debug(Dbg.pprint(response))
            for alarm in response["MetricAlarms"]:
                alarm_name = alarm["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is not None:
                    # This is an alarm that belongs to this CloneSquad instance
                    alarms.append(alarm)
        #log.debug(Dbg.pprint(alarms))
        self.alarms = alarms

        # Sanity check
        for index in self.alarm_definitions.keys():
            alarm_def = self.alarm_definitions[index]
            if "AlarmName" not in alarm_def:
                continue
            alarm = next(
                filter(lambda a: a["AlarmName"] == alarm_def["AlarmName"],
                       self.alarms), None)
            if alarm is None:
                log.warning(
                    "Alarm definition [%s](%s => %s) doesn't match an existing CloudWatch alarm!"
                    % (alarm_def["Definition"]["Key"],
                       alarm_def["Definition"]["Value"],
                       alarm_def["Definition"]["Status"]))

        # Read all metrics associated with alarms

        # Intense CloudWatch polling can be expensive: this algorithm links the CW metric polling rate to the
        #    scale rate => Under intense scale-up conditions, polling is aggressive. If not, it falls down
        #    to one polling every 'cloudwatch.metrics.low_rate_polling_interval' seconds (see the sketch after this method)
        # TODO(@jcjorel): Avoid this kind of direct references to an upper level module!!
        integration_period = Cfg.get_duration_secs(
            "ec2.schedule.horizontalscale.integration_period")
        instance_scale_score = self.ec2.get_integrated_float_state(
            "ec2.schedule.scaleout.instance_scale_score", integration_period)

        self.metric_cache = self.get_metric_cache()

        query = {"IdMapping": {}, "Queries": []}

        # Build query for Alarm metrics
        if Cfg.get("ec2.schedule.desired_instance_count") == "-1":
            # Sort by oldest alarms first in cache
            cached_metric_names = [m["_MetricId"] for m in self.metric_cache]
            valid_alarms = []
            for a in alarms:
                alarm_name = a["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is None or alarm_def["AlarmDefinition"][
                        "Url"].startswith("alarmname:"):
                    continue
                a["_SamplingTime"] = self.get_metric_by_id(
                    alarm_name
                )["_SamplingTime"] if alarm_name in cached_metric_names else str(
                    misc.epoch())
                valid_alarms.append(a)
            sorted_alarms = sorted(
                valid_alarms, key=lambda a: misc.str2utc(a["_SamplingTime"]))

            # We poll from the oldest to the newest, depending on the instance_scale_score, to limit CloudWatch GetMetricData costs
            time_for_full_metric_refresh = max(
                Cfg.get_duration_secs(
                    "cloudwatch.metrics.time_for_full_metric_refresh"), 1)
            app_run_period = Cfg.get_duration_secs("app.run_period")
            minimum_polled_alarms_per_run = Cfg.get_int(
                "cloudwatch.metrics.minimum_polled_alarms_per_run")
            maximum_polled_alarms_per_run = app_run_period / time_for_full_metric_refresh
            maximum_polled_alarms_per_run = min(maximum_polled_alarms_per_run,
                                                1.0)
            weight = min(instance_scale_score, maximum_polled_alarms_per_run)
            max_alarms_for_this_run = max(
                minimum_polled_alarms_per_run,
                int(min(weight, 1.0) * len(sorted_alarms)))
            for alarm in sorted_alarms[:max_alarms_for_this_run]:
                alarm_name = alarm["AlarmName"]
                CloudWatch._format_query(query, alarm_name, alarm)

            # We always poll user supplied alarms
            for alarm in alarms:
                alarm_name = alarm["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is None:
                    continue  # Unknown alarm name
                if not alarm_def["AlarmDefinition"]["Url"].startswith(
                        "alarmname:"):
                    continue
                CloudWatch._format_query(query, alarm_name, alarm)

        # Query Metric for Burstable instances
        burstable_instances = self.ec2.get_burstable_instances(
            ScalingState="-error")
        last_collect_date = self.ec2.get_state_date(
            "cloudwatch.metrics.last_burstable_metric_collect_date")
        if last_collect_date is None or (now - last_collect_date) > timedelta(
                minutes=1):
            for i in burstable_instances:
                instance_id = i["InstanceId"]
                if not self.ec2.is_static_subfleet_instance(
                        instance_id) and self.ec2.get_scaling_state(
                            instance_id) == "excluded":
                    continue
                CloudWatch._format_query(
                    query, "%s/%s" % ("CPUCreditBalance", instance_id), {
                        "MetricName":
                        "CPUCreditBalance",
                        "Namespace":
                        "AWS/EC2",
                        "Dimensions": [{
                            "Name": "InstanceId",
                            "Value": instance_id
                        }],
                        "Period":
                        300,
                        "Statistic":
                        "Average"
                    })
            self.ec2.set_state(
                "cloudwatch.metrics.last_burstable_metric_collect_date",
                now,
                TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))

        # Make request to CloudWatch
        query_counter = self.ec2.get_state_int(
            "cloudwatch.metric.query_counter", default=0)
        queries = query["Queries"]
        metric_results = []
        metric_ids = []
        no_metric_ids = []
        while len(queries) > 0:
            q = queries[:500]
            queries = queries[500:]
            results = []
            response = None
            while response is None or "NextToken" in response:
                args = {
                    "MetricDataQueries":
                    q,
                    "StartTime":
                    now - timedelta(seconds=Cfg.get_duration_secs(
                        "cloudwatch.metrics.data_period")),
                    "EndTime":
                    now
                }
                if response is not None:
                    args["NextToken"] = response["NextToken"]
                response = client.get_metric_data(**args)
                results.extend(response["MetricDataResults"])
                query_counter += len(q)

            for r in results:
                if r["StatusCode"] != "Complete":
                    log.error("Failed to retrieve metrics: %s" % q)
                    continue
                metric_id = query["IdMapping"][r["Id"]]
                if len(r["Timestamps"]) == 0:
                    if metric_id not in no_metric_ids:
                        no_metric_ids.append(metric_id)
                    continue
                if metric_id not in metric_ids: metric_ids.append(metric_id)
                r["_MetricId"] = metric_id
                r["_SamplingTime"] = str(now)
                log.debug(r)
                metric_results.append(r)
        if len(no_metric_ids):
            log.info("No metrics returned for alarm '%s'" % no_metric_ids)

        # Merge with existing cache metric
        metric_cache = self.metric_cache
        self.metric_cache = metric_results
        for m in metric_cache:
            max_retention_period = Cfg.get_duration_secs(
                "cloudwatch.metrics.cache.max_retention_period")
            if m["_MetricId"] in metric_ids or "_SamplingTime" not in m:
                continue
            if (now - misc.str2utc(m["_SamplingTime"])
                ).total_seconds() < max_retention_period:
                self.metric_cache.append(m)

        self.ec2.set_state("cloudwatch.metric.query_counter",
                           query_counter,
                           TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
        self.ec2.set_state_json(
            "cloudwatch.metrics.cache",
            self.metric_cache,
            TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
        self.set_metric("Cloudwatch.GetMetricData", query_counter)

        # Augment Alarm definitions and Instances with associated metrics
        for metric in self.metric_cache:
            metric_id = metric["_MetricId"]

            alarm_data = self.get_alarm_data_by_name(metric_id)
            if alarm_data is not None:
                alarm_data["MetricDetails"] = metric
                continue

            instance = next(
                filter(
                    lambda i: "CPUCreditBalance/%s" % i["InstanceId"] ==
                    metric_id, burstable_instances), None)
            if instance is not None:
                instance["_Metrics"] = {}
                instance["_Metrics"]["CPUCreditBalance"] = metric
                continue
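# --- Illustrative sketch (not part of the original class) ---
# Standalone restatement of the alarm-polling budget computed above: the fraction of alarms
# polled per run grows with the scaling pressure (instance_scale_score) and is capped by
# app.run_period / time_for_full_metric_refresh. All numbers below are invented.
def _max_alarms_for_this_run(nb_alarms, instance_scale_score,
                             app_run_period, time_for_full_metric_refresh,
                             minimum_polled_alarms_per_run=1):
    max_ratio = min(app_run_period / max(time_for_full_metric_refresh, 1), 1.0)
    weight    = min(instance_scale_score, max_ratio)
    return max(minimum_polled_alarms_per_run, int(min(weight, 1.0) * nb_alarms))

assert _max_alarms_for_this_run(20, 0.05, app_run_period=20, time_for_full_metric_refresh=90) == 1
assert _max_alarms_for_this_run(20, 1.0,  app_run_period=20, time_for_full_metric_refresh=90) == 4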
Example #15
    def handler(self, event, context):
        # Protect from bad data and keep only SNS messages
        if "Records" not in event:
            log.error("Not a valid SNS event")
            return

        sns_records = []
        for sns_msg in event["Records"]:
            if "EventSource" in sns_msg and sns_msg["EventSource"] == "aws:sns":
                try:
                    sns_msg["_decoded_message"] = json.loads(
                        sns_msg["Sns"]["Message"])
                    sns_records.append(sns_msg)
                except Exception as e:
                    log.exception("Failed to decode message %s : %s" %
                                  (sns_msg, e))

        log.debug(Dbg.pprint(sns_records))

        need_main_update = False

        # For each SNS record, we keep track of important data in
        #    a DynamoDB table
        for sns_msg in sns_records:
            message = sns_msg["_decoded_message"]
            timestamp = datetime.fromisoformat(
                message["StateChangeTime"].replace(
                    "+0000", "")).replace(tzinfo=timezone.utc)
            alarm_name = message["AlarmName"]
            new_state_reason = message["NewStateReason"]
            new_state_value = message["NewStateValue"]
            namespace = message["Trigger"]["Namespace"]
            metric_name = message["Trigger"]["MetricName"]
            dimensions = message["Trigger"]["Dimensions"]
            instance_id = "None"
            try:
                instance_id = next(
                    filter(lambda dimension: dimension['name'] == 'InstanceId',
                           message["Trigger"]["Dimensions"]))["value"]
            except Exception as e:
                log.exception(
                    "Failed to get InstanceId from dimension %s : %s" %
                    (message["Trigger"]["Dimensions"], e))
                continue

            now = misc.seconds_from_epoch_utc()

            response = self.context["dynamodb.client"].update_item(
                Key={"AlarmName": {
                    'S': alarm_name
                }},
                UpdateExpression=
                "set InstanceId=:instanceid, %s_LastAlarmTimeStamp=:timestamp, %s_LastNewStateReason=:lastnewstatereason,"
                "%s_LastMetricName=:lastmetricname, %s_LastMetricNamespace=:lastmetricnamespace, "
                "%s_Event=:event,"
                "ExpirationTime=:expirationtime,"
                "LastRecordUpdateTime=:lastrecordupdatetime" %
                (new_state_value, new_state_value, new_state_value,
                 new_state_value, new_state_value),
                ExpressionAttributeValues={
                    ':instanceid': {
                        'S': instance_id
                    },
                    ':timestamp': {
                        'S': str(timestamp)
                    },
                    ':lastnewstatereason': {
                        'S': new_state_reason
                    },
                    ':lastmetricname': {
                        'S': metric_name
                    },
                    ':lastmetricnamespace': {
                        'S': namespace
                    },
                    ':event': {
                        'S': json.dumps(message)
                    },
                    ':expirationtime': {
                        'N':
                        str(now + Cfg.get_duration_secs(
                            "snsmgr.record_expiration_delay"))
                    },
                    ':lastrecordupdatetime': {
                        'N': str(now)
                    }
                },
                ReturnConsumedCapacity='TOTAL',
                TableName=self.context["AlarmStateEC2Table"],
            )
            need_main_update = True

        if need_main_update:
            # Send a message to wakeup the Main Lambda function that is in
            #   charge to take appropriate decision
            sqs.call_me_back_send(self.ec2)
            log.debug("Sent SQS message to Main lambda queue: %s" %
                      self.context["MainSQSQueue"])
def main_handler_entrypoint(event, context):
    """

    Parameters
    ----------
    event: dict, required

    context: object, required
        Lambda Context runtime methods and attributes

        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    ------

    """

    #print(Dbg.pprint(event))

    ctx["now"] = misc.utc_now()
    ctx["FunctionName"] = "Main"

    init()

    if Cfg.get_int("app.disable") != 0 and not misc.is_sam_local():
        log.warning("Application disabled due to 'app.disable' key")
        return

    no_is_called_too_early = False
    # Manage Spot interruption as fast as we can
    if sqs.process_sqs_records(event, function=ec2_schedule.manage_spot_notification, function_arg=ctx):
        log.info("Managed Spot Interruption SQS record!")
        # Force to run now, disregarding `app.run_period`, as we have at least one Spot instance to 
        #   remove from target groups immediately
        no_is_called_too_early = True
    
    # Check that we are not called too early
    #   Note: We perform a direct read of the KVTable to spare initialization time when the
    #   Lambda is called too early
    ctx["main.last_call_date"] = ctx["o_ec2"].get_state("main.last_call_date", direct=True)
    if ctx["main.last_call_date"] is None or ctx["main.last_call_date"] == "": 
        ctx["main.last_call_date"] = str(misc.epoch())

    if not no_is_called_too_early and is_called_too_early():
        log.log(log.NOTICE, "Called too early by: %s" % event)
        notify.do_not_notify = True
        sqs.process_sqs_records(event)
        sqs.call_me_back_send()
        return

    log.debug("Load prerequisites.")
    load_prerequisites(["o_state", "o_notify", "o_ec2", "o_cloudwatch", "o_targetgroup", "o_ec2_schedule", "o_scheduler", "o_rds"])

    # Remember 'now' as the last execution date
    ctx["o_ec2"].set_state("main.last_call_date", value=ctx["now"], TTL=Cfg.get_duration_secs("app.default_ttl"))

    Cfg.dump()

    # Perform actions:
    log.debug("Main processing.")
    ctx["o_targetgroup"].manage_targetgroup()
    ctx["o_ec2_schedule"].schedule_instances()
    ctx["o_ec2_schedule"].stop_drained_instances()
    ctx["o_cloudwatch"].configure_alarms()
    ctx["o_rds"].manage_subfleet_rds()
    ctx["o_ec2_schedule"].prepare_metrics()

    ctx["o_cloudwatch"].send_metrics()
    ctx["o_cloudwatch"].configure_dashboard()

    # If we got woke up by SNS, acknowledge the message(s) now
    sqs.process_sqs_records(event)

    ctx["o_notify"].notify_user_arn_resources()

    # Call me back if needed
    sqs.call_me_back_send()
    def _manage_targetgroup(self, targetgroup, running_instances, transitions):
        now                = self.context["now"]
        registered_targets = self.get_registered_targets(targetgroup)[0]

        #  Generate events on instance state transition 
        for instance in self.ec2.get_instances(ScalingState="-excluded"):
            instance_id    = instance["InstanceId"]
            previous_state = self.get_instance_state(instance_id, targetgroup)
            if previous_state is None: previous_state = "None"
            target_instance = self.is_instance_registered(targetgroup, instance_id)
            current_state = target_instance['TargetHealth']["State"] if target_instance is not None else "None"
            if current_state != previous_state:
                transitions.append({
                        "InstanceId": instance_id,
                        "TargetGroupArn": targetgroup,
                        "PreviousState" : previous_state,
                        "NewState": current_state
                    })
            self.set_instance_state(instance_id, targetgroup, current_state)
        
        # List instances that are running and not yet in the TargetGroup
        instance_ids_to_add = []
        for instance in running_instances:
            instance_id = instance["InstanceId"]
            if self.ec2.get_scaling_state(instance_id) in ["draining", "bounced", "error"]:
                continue

            target_instance = self.is_instance_registered(targetgroup, instance_id)
            if target_instance is None:
                instance_ids_to_add.append({'Id':instance_id})
                self.set_instance_state(instance_id, targetgroup, "None")


        if len(instance_ids_to_add) > 0:
            log.debug("Registering instance(s) in TargetGroup: %s" % instance_ids_to_add)
            for instance_id in instance_ids_to_add:
                try:
                    response = R(lambda args, kwargs, r: r["ResponseMetadata"]["HTTPStatusCode"] == 200,
                        self.client_elbv2.register_targets, TargetGroupArn=targetgroup, Targets=[instance_id] 
                    )
                except Exception as e:
                    log.exception("Failed to register target '%s' in targetgroup '%s'!' : %s" % 
                            (instance_id, targetgroup["TargetGroupArn"], e))
            self.state_changed = True

        if self.state_changed:
            return

        # When there are instances in 'initial' state, we have to react more slowly to
        #    misbehavior in case 'initial' instances fail their health checks (see the sketch after this method).
        slow_deregister = len(self.get_registered_instance_ids(state="initial")) != 0

        # List instances that are no more running but still in the TargetGroup
        delayed_deregister_instance_ids = []
        instance_ids_to_delete          = []
        draining_instances              = self.ec2.get_instances(ScalingState="excluded,draining,bounced,error")
        slow_deregister_timeout         = int(Cfg.get_duration_secs("targetgroup.slow_deregister_timeout"))
        for instance in registered_targets:
            instance_id = instance["Target"]["Id"]
            instance = self.ec2.get_instance_by_id(instance_id)

            if self.is_instance_registered(targetgroup, instance_id, fail_if_draining=True) is None:
                continue

            if instance is None or instance["State"]["Name"] not in ["pending","running"] or instance_id in self.ec2.get_instance_ids(draining_instances):
                meta                    = {}
                self.ec2.get_scaling_state(instance_id, meta=meta)
                if meta["last_action_date"] is not None and slow_deregister:
                    gap_secs                = (now - meta["last_action_date"]).total_seconds()
                    if gap_secs < (slow_deregister_timeout * random.random()):
                        if instance_id not in [ i["InstanceId"] for i in delayed_deregister_instance_ids]: 
                            delayed_deregister_instance_ids.append({
                            "InstanceId": instance_id,
                            "Gap": gap_secs
                            })
                        continue
                instance_ids_to_delete.append({'Id':instance_id})

        for i in delayed_deregister_instance_ids:
            log.info("Slow deregister mode: Instance '%s' is waiting deregister for %d seconds... (targetgroup.slow_deregister_timeout=%s + jitter...)" % 
                    (i["InstanceId"], i["Gap"], slow_deregister_timeout))


        if len(instance_ids_to_delete) > 0:
            log.debug("Deregistering instance(s) in TargetGroup: %s" % instance_ids_to_delete)
            response = R(lambda args, kwargs, r: r["ResponseMetadata"]["HTTPStatusCode"] == 200,
                self.client_elbv2.deregister_targets, TargetGroupArn=targetgroup, Targets=instance_ids_to_delete
            )
            self.state_changed = True
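# --- Illustrative sketch (not part of the original class) ---
# 'Slow deregister' mode, as used above: while some targets are still in 'initial' state, an
# instance is deregistered only once the time since its last scaling action exceeds a randomized
# fraction of targetgroup.slow_deregister_timeout, which spreads deregistrations over time.
# The values below are invented.
import random

def _should_delay_deregister(gap_secs, slow_deregister_timeout_secs):
    return gap_secs < (slow_deregister_timeout_secs * random.random())

random.seed(0)  # deterministic output for this example only
print(_should_delay_deregister(10, 120))   # recent transition: very likely delayed
print(_should_delay_deregister(130, 120))  # older than the timeout: never delayed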
Example #18
    def __init__(self, context, o_state):
        self.context = context
        self.instances = None
        self.instance_ids = None
        self.instance_statuses = None
        self.prereqs_done = False
        self.o_state = o_state
        self.state_table = None

        Cfg.register({
            "ec2.describe_instances.max_results": "250",
            "ec2.describe_instance_types.enabled": "0",
            "ec2.az.statusmgt.disable": 0,
            "ec2.az.unavailable_list,Stable": {
                "DefaultValue":
                "",
                "Format":
                "StringList",
                "Description":
                """List of Availability Zone names (ex: *eu-west-3c*) or AZ Ids (ex: *euw3-az1*).

Typical usage is to force a fleet to consider one or more AZs as unavailable (AZ eviction). The autoscaler will then refuse to schedule
new instances on these AZs. Existing instances in those AZs are left unchanged but, on scale-in conditions, will be 
shut down in priority (see [`ec2.az.evict_instances_when_az_faulty`](#ec2azevict_instances_when_az_faulty) to change this behavior). 

This setting can be used during an AWS LSE (Large Scale Event) to manually define that an AZ is unavailable.

> Note: CloneSquad also uses the EC2.describe_availability_zones() API to dynamically discover LSE events. So, setting this key directly
should not be needed in most cases.

Please notice that, once an AZ is enabled again (either manually or automatically), instance fleet WON'T be rebalanced automatically:
* If Instance bouncing is enabled, the fleet will be progressively rebalanced (convergence time will depend on the instance bouncing setting)
* If instance bouncing is not configured, user can force a rebalancing by switching temporarily the fleet to `100%` during few minutes 
(with [`ec2.schedule.desired_instance_count`](#ec2scheduledesired_instance_count) sets temporarily to `100%`) and switch back to the 
original value.

                     """
            },
            "ec2.az.evict_instances_when_az_faulty,Stable": {
                "DefaultValue":
                "0",
                "Format":
                "Bool",
                "Description":
                """Defines if instances running in a AZ with issues must be considered 'unavailable'

By Default, instances running in an AZ reported with issues are left untouched and these instances will only be evicted if
their invidual healthchecks fail or on scalein events.

Settting this parameter to 1 will force Clonesquad to consider all the instances running in faulty AZ as 'unavailable' and so
forcing their immediate replacement in healthy AZs in the region. 
                 """
            },
            "ec2.state.default_ttl": "days=1",
            "ec2.state.error_ttl": "minutes=5",
            "ec2.state.status_ttl": "days=40",
            "ec2.state.error_instance_ids": "",
            "ec2.state.excluded_instance_ids": {
                "DefaultValue":
                "",
                "Format":
                "List of String",
                "Description":
                """List of instance ids to consider as excluded.

                     One of the 2 ways to exclude existant instances to be managed by CloneSquad, this key is a list of instance ids (ex: 
                     i-077b2ae6988f33de4;i-0564c45bfa5bb6aa5). The other way to exclude instances, is to tag instances with "clonesquad:excluded" key
                     with value 'True'.
                     """
            },
            "ec2.debug.availability_zones_impaired": "",
        })

        self.o_state.register_aggregates([{
            "Prefix":
            "ec2.instance.",
            "Compress":
            True,
            "DefaultTTL":
            Cfg.get_duration_secs("ec2.state.default_ttl"),
            "Exclude": ["ec2.instance.scaling.state."]
        }])
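# --- Illustrative sketch (not part of the original class) ---
# One plausible reading of the two exclusion mechanisms documented in
# 'ec2.state.excluded_instance_ids': an explicit ';'-separated id list, or a 'clonesquad:excluded'
# tag with a truthy value. Instance ids, tag layout and truthiness handling are assumptions.
def _is_excluded(instance, excluded_instance_ids_spec):
    excluded_ids = [i for i in excluded_instance_ids_spec.split(";") if i != ""]
    if instance["InstanceId"] in excluded_ids:
        return True
    tags = {t["Key"]: t["Value"] for t in instance.get("Tags", [])}
    return tags.get("clonesquad:excluded", "").lower() in ["1", "true"]

_instance = {"InstanceId": "i-0123456789abcdef0",
             "Tags": [{"Key": "clonesquad:excluded", "Value": "True"}]}
assert _is_excluded(_instance, "")
assert _is_excluded({"InstanceId": "i-0123456789abcdef0"}, "i-0123456789abcdef0")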
Example #19
    def start_instances(self, instance_ids_to_start, max_started_instances=-1):
        # Remember when we tried to start all these instances. Used to detect instances with issues
        #    by placing them at the end of the get_instances() generated list
        if instance_ids_to_start is None or len(instance_ids_to_start) == 0:
            return
        now = self.context["now"]

        client = self.context["ec2.client"]
        for i in instance_ids_to_start:
            if max_started_instances == 0:
                break
            self.set_state("ec2.instance.last_start_attempt_date.%s" % i,
                           now,
                           TTL=Cfg.get_duration_secs("ec2.schedule.state_ttl"))

            log.info("Starting instance %s..." % i)
            response = None
            try:
                response = R(lambda args, kwargs, r: r["ResponseMetadata"][
                    "HTTPStatusCode"] == 200,
                             client.start_instances,
                             InstanceIds=[i])
            except Exception as e:
                log.exception(
                    "Got Exception while trying to start instance '%s' : %s" %
                    (i, e))
                # Mark the instance in error only if its status is not 'pending' or 'running'.
                #   With Spot instances, from time to time, we catch an insufficient-capacity error even though the
                #   instance succeeded to start. We issue a describe_instances call to check the real state of this
                #   instance and confirm or deny the error (see the sketch after this method).
                try:
                    response = R(
                        lambda args, kwargs, r: "Reservations" in r and len(
                            r["Reservations"]) > 0,
                        client.describe_instances,
                        InstanceIds=[i])
                except Exception:
                    response = None
                if (response is None or "Reservations" not in response
                        or len(response["Reservations"][0]["Instances"]) == 0
                        or response["Reservations"][0]["Instances"][0]["State"]
                    ["Name"] not in ["pending", "running"]):
                    self.set_scaling_state(
                        i,
                        "error",
                        ttl=Cfg.get_duration_secs("ec2.state.error_ttl"))
                continue
            if response is not None: log.debug(Dbg.pprint(response))

            # Remember when we started these instances
            metadata = response["ResponseMetadata"]
            if metadata["HTTPStatusCode"] == 200:
                s = response["StartingInstances"]
                for r in s:
                    instance_id = r["InstanceId"]
                    previous_state = r["PreviousState"]
                    current_state = r["CurrentState"]
                    if current_state["Name"] in ["pending", "running"]:
                        self.set_state(
                            "ec2.instance.last_start_date.%s" % instance_id,
                            now,
                            TTL=Cfg.get_duration_secs("ec2.state.status_ttl"))
                        max_started_instances -= 1
                    else:
                        log.error(
                            "Failed to start instance '%s'! Blacklist it for a while... (pre/current status=%s/%s)"
                            % (instance_id, previous_state["Name"],
                               current_state["Name"]))
                        self.set_scaling_state(
                            instance_id,
                            "error",
                            ttl=Cfg.get_duration_secs("ec2.state.error_ttl"))
                        R(None,
                          self.instance_in_error,
                          Operation="start",
                          InstanceId=instance_id,
                          PreviousState=previous_state["Name"],
                          CurrentState=current_state["Name"])
            else:
                log.error("Failed to call start_instances: %s" % i)
    def get_prerequisites(self):
        if Cfg.get_int("cron.disable"):
            return

        # Get Timezone related info
        self.timezones = yaml.safe_load(
            misc.get_url("internal:region-timezones.yaml"))
        self.tz = os.getenv("TimeZone")
        self.tz = self.timezones.get(self.context["AWS_DEFAULT_REGION"]) if (
            self.tz is None or self.tz == "") else self.tz
        self.tz = self.tz if self.tz else "UTC"
        self.local_now = arrow.now(
            self.tz)  # Get local time (with local timezone)
        self.utc_offset = self.local_now.utcoffset()
        self.dst_offset = self.local_now.dst()
        log.log(
            log.NOTICE,
            "Current timezone offset to UTC: %s, DST: %s, TimeZone: %s" %
            (self.utc_offset, self.dst_offset, self.tz))

        # Load scheduler KV table
        self.scheduler_table = kvtable.KVTable.create(
            self.context,
            self.context["SchedulerTable"],
            cache_max_age=Cfg.get_duration_secs("scheduler.cache.max_age"))

        # Compute event names
        self.load_event_definitions()

        # Read all existing event rules
        client = self.context["events.client"]
        params = {
            "NamePrefix": "CS-Cron-%s-" % (self.context["GroupName"]),
            "Limit": 10
        }
        self.rules = []
        paginator = client.get_paginator('list_rules')
        response_iterator = paginator.paginate(**params)
        for response in response_iterator:
            if "Rules" in response:
                self.rules.extend(response["Rules"])

        max_rules_per_batch = Cfg.get_int("cron.max_rules_per_batch")
        # Create missing rules
        expected_rule_names = [r["Name"] for r in self.event_names]
        existing_rule_names = [r["Name"] for r in self.rules]
        for r in expected_rule_names:
            if r not in existing_rule_names:
                max_rules_per_batch -= 1
                if max_rules_per_batch <= 0:
                    break
                rule_def = self.get_ruledef_by_name(r)
                schedule_spec = rule_def["Data"][0]["schedule"]
                schedule_expression = self.process_cron_expression(
                    schedule_spec)
                log.log(
                    log.NOTICE,
                    f"Creating {r} {schedule_spec} => {schedule_expression}..."
                )

                # In order to reduce the burden on the user, we perform a sanity check for a well-known
                #    limitation of CloudWatch Events cron expressions.
                if schedule_expression.startswith("cron("):
                    expr = [
                        i
                        for i in schedule_expression.replace("(", " ").replace(
                            ")", " ").split(" ") if i != ""
                    ]
                    if len(expr) != 7:
                        log.warn(
                            "Schedule rule '%s' has an invalid cron expression '%s' (wrong number of cron fields)! Ignoring it..."
                            % (rule_def["EventName"], schedule_expression))
                        continue
                    if expr[3] != '?' and expr[5] != '?':
                        log.warn(
                            "Schedule rule '%s' has an invalid cron expression '%s'. "
                            "You can't specify the Day-of-month and Day-of-week fields in the same cron expression: "
                            "if you specify a value (or a *) in one of them, you must use a ? (question mark) in the other."
                            % (rule_def["EventName"], schedule_expression))
                        continue

                # Update Cloudwatch rule
                try:
                    response = client.put_rule(
                        Name=r,
                        Description="Schedule Event '%s': %s" %
                        (rule_def["EventName"], rule_def["Event"]),
                        RoleArn=self.context["CloudWatchEventRoleArn"],
                        ScheduleExpression=schedule_expression,
                        State='ENABLED')
                    log.debug("put_rule: %s" % response)
                except Exception as e:
                    log.exception(
                        "Failed to create scheduler event '%s' (%s) : %s" %
                        (r, schedule_expression, e))

                try:
                    response = client.put_targets(
                        Rule=r,
                        Targets=[{
                            'Arn': self.context["InteractLambdaArn"],
                            'Id': "id%s" % r,
                        }])
                    log.debug("put_targets: %s" % response)
                except Exception as e:
                    log.exception(
                        "Failed to set targets for event rule '%s' : %s" %
                        (r, e))

        # Garbage collect obsolete rules
        for r in existing_rule_names:
            if r not in expected_rule_names:
                max_rules_per_batch -= 1
                if max_rules_per_batch <= 0:
                    break
                try:
                    client.remove_targets(Rule=r, Ids=["id%s" % r])
                    client.delete_rule(Name=r)
                except Exception as e:
                    log.exception("Failed to delete rule '%s' : %s" % (r, e))
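# A small helper sketching the same cron sanity checks performed above (field count and the
# '?' constraint between Day-of-month and Day-of-week); it is a pre-check only, not the
# CloudWatch parser itself:
def is_valid_cloudwatch_cron(schedule_expression):
    """Roughly validate a 'cron(...)' expression before calling put_rule."""
    if not schedule_expression.startswith("cron("):
        return True  # rate(...) expressions are not checked here
    tokens = [t for t in schedule_expression.replace("(", " ").replace(")", " ").split(" ") if t]
    fields = tokens[1:]  # drop the leading 'cron' token; 6 fields are expected
    if len(fields) != 6:
        return False
    day_of_month, day_of_week = fields[2], fields[4]
    return day_of_month == "?" or day_of_week == "?"

# is_valid_cloudwatch_cron("cron(0 8 * * ? *)") -> True
# is_valid_cloudwatch_cron("cron(0 8 * * * *)") -> False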
    def get_prerequisites(self):
        rds_client = self.context["rds.client"]
        tagging_client = self.context["resourcegroupstaggingapi.client"]

        self.databases = {"db": [], "cluster": []}
        for db_type in list(self.databases.keys()):
            paginator = tagging_client.get_paginator('get_resources')
            tag_mappings = itertools.chain.from_iterable(
                page['ResourceTagMappingList'] for page in paginator.paginate(
                    ResourceTypeFilters=["rds:%s" % db_type],
                    TagFilters=[{
                        'Key': 'clonesquad:group-name',
                        'Values': [self.context["GroupName"]]
                    }]))
            self.databases["%s.tags" % db_type] = list(tag_mappings)
            if len(self.databases["%s.tags" % db_type]) == 0:
                continue
            if db_type == "cluster":
                func = rds_client.describe_db_clusters
                filter_key = "db-cluster-id"
                response_index = "DBClusters"
            if db_type == "db":
                func = rds_client.describe_db_instances
                filter_key = "db-instance-id"
                response_index = "DBInstances"
            try:
                self.databases[db_type].extend(
                    func(Filters=[{
                        'Name':
                        filter_key,
                        'Values': [
                            t["ResourceARN"]
                            for t in self.databases["%s.tags" % db_type]
                        ]
                    }])[response_index])
            except Exception as e:
                log.exception("Failed to describe RDS database type '%s'" %
                              (db_type))

        #log.debug(Dbg.pprint(self.databases))

        Cfg.register({
            "rds.state.default_ttl": "hours=2",
            "rds.metrics.time_resolution": "60",
        })

        self.state_table = self.o_state.get_state_table()
        self.state_table.register_aggregates([{
            "Prefix":
            "rds.",
            "Compress":
            True,
            "DefaultTTL":
            Cfg.get_duration_secs("rds.state.default_ttl"),
            "Exclude": []
        }])

        metric_time_resolution = Cfg.get_int("rds.metrics.time_resolution")
        if metric_time_resolution < 60:
            metric_time_resolution = 1  # Switch to highest resolution
        self.cloudwatch.register_metric([
            {
                "MetricName": "StaticFleet.RDS.Size",
                "Unit": "Count",
                "StorageResolution": metric_time_resolution
            },
            {
                "MetricName": "StaticFleet.RDS.AvailableDBs",
                "Unit": "Count",
                "StorageResolution": metric_time_resolution
            },
            {
                "MetricName": "StaticFleet.RDS.StoppingDBs",
                "Unit": "Count",
                "StorageResolution": metric_time_resolution
            },
            {
                "MetricName": "StaticFleet.RDS.StartingDBs",
                "Unit": "Count",
                "StorageResolution": metric_time_resolution
            },
        ])

        # We need to dynamically register the static subfleet configuration keys to avoid a 'key unknown' warning
        #   when the user sets them.
        static_subfleet_names = self.get_rds_subfleet_names()
        for static_fleet in static_subfleet_names:
            key = "staticfleet.%s.state" % static_fleet
            if not Cfg.is_builtin_key_exist(key):
                Cfg.register({key: ""})
        log.log(
            log.NOTICE,
            "Detected the following static subfleet names across RDS resources: %s"
            % static_subfleet_names)
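# The tag-driven discovery at the top of this method boils down to the following standalone
# sketch using plain boto3 (the 'clonesquad:group-name' tag key comes from the code above;
# client creation and the region are illustrative assumptions):
import itertools
import boto3

def list_group_rds_arns(group_name, db_type="db", region_name="eu-west-1"):
    """Return ARNs of RDS resources ('db' or 'cluster') tagged for the given CloneSquad group."""
    tagging = boto3.client("resourcegroupstaggingapi", region_name=region_name)
    paginator = tagging.get_paginator("get_resources")
    pages = paginator.paginate(
        ResourceTypeFilters=["rds:%s" % db_type],
        TagFilters=[{"Key": "clonesquad:group-name", "Values": [group_name]}])
    mappings = itertools.chain.from_iterable(p["ResourceTagMappingList"] for p in pages)
    return [m["ResourceARN"] for m in mappings]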
    def get_prerequisites(self):
        """ Gather instance status by calling SSM APIs.
        """
        if not Cfg.get_int("ssm.enable"):
            log.log(log.NOTICE, "SSM support is currently disabled. Set ssm.enable to 1 to enable it.")
            return
        now       = self.context["now"]
        self.ttl  = Cfg.get_duration_secs("ssm.state.default_ttl")
        GroupName = self.context["GroupName"]

        misc.initialize_clients(["ssm"], self.context)
        client = self.context["ssm.client"]

        # Retrieve all SSM maintenance windows applicable to this CloneSquad deployment
        mw_names = {
            "__globaldefault__": {},
            "__default__": {},
            "__main__": {},
            "__all__":  {}
        }

        fmt                              = self.context.copy()
        mw_names["__globaldefault__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.global_defaults", fmt=fmt)
        mw_names["__default__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.defaults", fmt=fmt)
        mw_names["__main__"]["Names"]    = Cfg.get_list("ssm.feature.maintenance_window.mainfleet.defaults", fmt=fmt)
        mw_names["__all__"]["Names"]     = Cfg.get_list("ssm.feature.maintenance_window.subfleet.__all__.defaults", fmt=fmt)

        all_mw_names = mw_names["__globaldefault__"]["Names"]
        all_mw_names.extend([ n for n in mw_names["__default__"]["Names"] if n not in all_mw_names])
        all_mw_names.extend([ n for n in mw_names["__main__"]["Names"] if n not in all_mw_names])
        all_mw_names.extend([ n for n in mw_names["__all__"]["Names"] if n not in all_mw_names])

        Cfg.register({
                f"ssm.feature.maintenance_window.subfleet.__all__.force_running":
                    Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running"),
                f"ssm.feature.events.ec2.scaling_state_changes.draining.__main__.connection_refused_tcp_ports": 
                    Cfg.get("ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports")
            })

        for SubfleetName in self.o_ec2.get_subfleet_names():
            fmt["SubfleetName"] = SubfleetName
            mw_names[f"Subfleet.{SubfleetName}"] = {}
            Cfg.register({
                f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults": Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults"),
                f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.ec2.schedule.min_instance_count": 
                    Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.ec2.schedule.min_instance_count"),
                f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running":
                    Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running"),
                f"ssm.feature.events.ec2.scaling_state_changes.draining.{SubfleetName}.connection_refused_tcp_ports": 
                    Cfg.get("ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports")
            })
            mw_names[f"Subfleet.{SubfleetName}"]["Names"] = Cfg.get_list(f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults", fmt=fmt)
            all_mw_names.extend([ n for n in mw_names[f"Subfleet.{SubfleetName}"]["Names"] if n not in all_mw_names])


        names = all_mw_names
        mws   = []
        while len(names):
            paginator = client.get_paginator('describe_maintenance_windows')
            response_iterator = paginator.paginate(
                Filters=[
                    {
                        'Key': 'Name',
                        'Values': names[:20]
                    },
                ])
            for r in response_iterator:
                for wi in r["WindowIdentities"]:
                    if not wi["Enabled"]:
                        log.log(log.NOTICE, "SSM Maintenance Window '%s' is not enabled. Ignoring it..." % wi["Name"])
                        continue
                    if "NextExecutionTime" not in wi:
                        log.log(log.NOTICE, "/!\ SSM Maintenance Window '%s' has no 'NextExecutionTime'." % wi["Name"])
                    if wi not in mws:
                        mws.append(wi)
            names = names[20:]
        # Convert string dates into datetime objects
        for d in mws:
            if "NextExecutionTime" in d:
                d["NextExecutionTime"] = misc.str2utc(d["NextExecutionTime"])

        # Retrieve Maintenance Window tags with the Resource Groups Tagging API
        tagged_mws = self.context["o_state"].get_resources(service="ssm", resource_name="maintenancewindow")
        for tmw in tagged_mws:
            mw_id = tmw["ResourceARN"].split("/")[1]
            mw = next(filter(lambda w: w["WindowId"] == mw_id, mws), None)
            if mw:
                mw["Tags"] = tmw["Tags"]
        valid_mws = []
        for mw in mws:
            mw_id=mw["WindowId"]
            if "Tags" not in mw:
                try:
                    response   = client.list_tags_for_resource(ResourceType='MaintenanceWindow', ResourceId=mw_id)
                    mw["Tags"] = response['TagList'] if 'TagList' in response else []
                except Exception as e:
                    log.error(f"Failed to fetch Tags for MaintenanceWindow '{mw_id}'")
            if ("Tags" not in mw or not len(mw["Tags"])) and mw["Name"] not in mw_names["__globaldefault__"]["Names"]:
                log.warning(f"Please tag SSM Maintenance Window '%s/%s' with 'clonesquad:group-name': '%s'!" %
                        (mw["Name"], mw["WindowId"], self.context["GroupName"]))
                continue
            valid_mws.append(mw)
        
        self.maintenance_windows = {
            "Names": mw_names,
            "Windows": valid_mws
        }

        # Update asynchronous results from previously launched commands
        self.update_pending_command_statuses()

        # Perform maintenance window house keeping
        self.manage_maintenance_windows()
        if len(mws):
            log.log(log.NOTICE, f"Found matching SSM maintenance windows: %s" % self.maintenance_windows["Windows"])
       
        # Hard dependency toward EC2 module. We update the SSM instance initializing states
        self.o_ec2.update_ssm_initializing_states()
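# The maintenance window lookup above batches names through the describe_maintenance_windows
# paginator, 20 filter values at a time. A stripped-down sketch of that loop (the 20-value
# batching is kept as an assumption about the filter limit; the region is illustrative):
import boto3

def describe_enabled_maintenance_windows(names, region_name="eu-west-1"):
    """Fetch enabled SSM Maintenance Windows matching the given names."""
    ssm = boto3.client("ssm", region_name=region_name)
    windows = []
    remaining = list(names)
    while remaining:
        paginator = ssm.get_paginator("describe_maintenance_windows")
        for page in paginator.paginate(Filters=[{"Key": "Name", "Values": remaining[:20]}]):
            for wi in page["WindowIdentities"]:
                if wi.get("Enabled") and wi not in windows:
                    windows.append(wi)
        remaining = remaining[20:]
    return windows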
    def __init__(self, context):
        self.context                 = context
        self.o_state                 = self.context["o_state"]
        self.maintenance_windows     = {}
        self.o_ec2                   = self.context["o_ec2"]
        GroupName                    = self.context["GroupName"]

        Cfg.register({
            "ssm.enable,Stable": {
                "DefaultValue": "0",
                "Format": "Bool",
                "Description": """Enable globally support for AWS System Manager by CloneSquad.

CloneSquad can leverage AWS SSM to take Maintenance Windows into account and use SSM RunCommand to execute status probe scripts located on managed instances.
            """
            },
            "ssm.feature.events.ec2.maintenance_window_period,Stable": {
                "DefaultValue": "0",
                "Format": "Bool",
                "Description": """Enable/Disable sending Enter/Exit Maintenance Window period events to instances.

This enables event notification of instances when they enter or exit an SSM Maintenance Window. When set to 1, CloneSquad sends an SSM RunCommand to run the `/etc/cs-ssm/(enter|exit)-maintenance-window-period` script located on each instance. The event is repeated until the script returns a zero code. If the script doesn't exist on an instance, the event is sent only once.

> This setting is taken into account only if [`ssm.enable`](#ssmenable) is set to 1.
            """
            },
            "ssm.feature.events.ec2.instance_ready_for_shutdown,Stable": {
                "DefaultValue": "0",
                "Format": "Bool",
                "Description": """Ensure instance shutdown readiness with the /etc/cs-ssm/instance-ready-for-shutdown script on SSM managed instances.

This enables support for direct sensing of instance shutdown readiness based on the return code of a script located on each EC2 instance. When set to 1, CloneSquad sends an SSM RunCommand to a managed instance candidate prior to shutdown:
* If `/etc/cs-ssm/instance-ready-for-shutdown` is present, it is executed with the SSM agent daemon user rights: if the script returns a NON-zero code, CloneSquad will postpone the instance shutdown and will call this script again after 2 * [`app.run_period`](#apprun_period) seconds...
* If `/etc/cs-ssm/instance-ready-for-shutdown` is NOT present, immediate shutdown readiness is assumed.

> This setting is taken into account only if [`ssm.enable`](#ssmenable) is set to 1.
            """
            },
             "ssm.feature.events.ec2.instance_ready_for_shutdown.max_shutdown_delay,Stable": {
                     "DefaultValue": "hours=1",
                     "Format": "Duration",
                     "Description": """Maximum time to spend waiting for the SSM-based ready-for-shutdown status.

When SSM support is enabled with [`ssm.feature.events.ec2.instance_ready_for_shutdown`](#ssmfeatureeventsec2instance_ready_for_shutdown), instances may notify CloneSquad when they are ready for shutdown. This setting defines
the maximum time CloneSquad waits for this signal before forcibly shutting down the instance.
                """
             },
            "ssm.feature.events.ec2.instance_ready_for_operation,Stable": {
                "DefaultValue": "0",
                "Format": "Bool",
                "Description": """Ensure an instance leaves the 'initializing' state based on an instance script's return code.

This enables support for direct sensing of instance **serving** readiness based on the return code of a script located on each EC2 instance. CloneSquad never stops an instance in the 'initializing' state. This state is normally left automatically after [`ec2.schedule.start.warmup_delay`](#ec2schedulestartwarmup_delay) seconds: when this setting is set, an SSM command is sent to each instance to call a script and get a direct acknowledgement that the instance can leave the 'initializing' state.

* If `/etc/cs-ssm/instance-ready-for-operation` is present, it is executed with the SSM agent daemon user rights: if the script returns a NON-zero code, CloneSquad will postpone the instance's exit from the 'initializing' state and will call this script again after 2 * [`app.run_period`](#apprun_period) seconds...
* If `/etc/cs-ssm/instance-ready-for-operation` is NOT present, the instance leaves the 'initializing' state immediately after the 'warmup delay'.

> This setting is taken into account only if [`ssm.enable`](#ssmenable) is set to 1.
            """
            },
            "ssm.feature.events.ec2.instance_ready_for_operation.max_initializing_time,Stable": {
                "DefaultValue": "hours=1",
                "Format": "Duration",
                "Description": """Max time that an instance can spend in the 'initializing' state.

When [`ssm.feature.events.ec2.instance_ready_for_operation`](#ssmfeatureeventsec2instance_ready_for_operation) is set, this setting defines the maximum duration during which CloneSquad will attempt to get a 'ready-for-operation' status for a specific instance through SSM RunCommand calls and execution of the `/etc/cs-ssm/instance-ready-for-operation` script.
            """
            },
            "ssm.feature.events.ec2.scaling_state_changes,Stable": {
                "DefaultValue": "0",
                "Format": "Bool",
                "Description": """Call a script on the instance when its scaling state changes.

When this toggle is set, the script `/etc/cs-ssm/instance-scaling-state-change` located on managed instances is called to notify about a scaling state change.
Currently, only `draining` and `bounced` events are sent (`bounced` is sent only if the instance bouncing feature is activated). For example, if an instance enters the `draining` state because CloneSquad wants to shut it down, this event is sent.

* If the script doesn't exist, the event is sent only once,
* If the script returns a non-zero code, the event will be repeated.

> Note: This event differs from [`ssm.feature.events.ec2.instance_ready_for_shutdown`](#ssmfeatureeventsec2instance_ready_for_shutdown) one as it is only meant to inform the instance about a status change. The [`ssm.feature.events.ec2.instance_ready_for_shutdown`](#ssmfeatureeventsec2instance_ready_for_shutdown) event is a request toward the instance asking for an approval to shutdown.

            """
            },
            "ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports,Stable": {
                "DefaultValue": "",
                "Format": "StringList",
                "Description": """On entering the `draining` state, the specified ports are blocked so that new TCP connections are refused (i.e. a *Connection refused* message).

This feature installs, **at `draining` time**, a temporary iptables chain and rules denying new TCP connections to the specified port list.
This is useful, for example, to break a healthcheck lifeline as soon as an instance enters the `draining` state: it is especially useful when non-ELB load balancers are used and CloneSquad has no way to tell these load balancers that no more traffic should be sent to a drained instance. As only new TCP connections are blocked, currently active connections can terminate gracefully during the draining period.

> When instances are served only by CloneSquad-managed ELB(s), there is no need to use this feature as CloneSquad will unregister the targets as soon as they are placed in the `draining` state.

By default, no blocked port list is specified, so no iptables call is performed on the instance.
            """
            },
            "ssm.feature.events.ec2.scaling_state_changes.draining.{SubfleetName}.connection_refused_tcp_ports,Stable": {
                "DefaultValue": "",
                "Format": "StringList",
                "Description": """Defines the blocked TCP port list for the specified fleet.

This setting overrides the value defined in [`ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports`](#ssmfeatureeventsec2scaling_state_changesdrainingconnection_refused_tcp_ports) for the specified fleet.

> Use `__main__` to designate the main fleet."""
            },
            "ssm.feature.events.ec2.instance_healthcheck": "0",
            "ssm.feature.maintenance_window,Stable": {
                "DefaultValue": "0",
                "Format": "Bool",
                "Description": """Defines if SSM maintenance window support is activated.

> This setting is taken into account only if [`ssm.enable`](#ssmenable) is set to 1.
            """
            },
            "ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running,Stable": {
                "DefaultValue": "1",
                "Format": "Bool",
                "Description": """Defines if a subfleet is forcibly set to 'running' when a maintenance window is active.

By default, all subfleets are woken up by a maintenance window ([`subfleet.{SubfleetName}.state`](#subfleetsubfleetnamestate) is temporarily forced to `running`).
            """,
            },
            "ssm.state.default_ttl": "hours=1",
            "ssm.state.command.default_ttl": "minutes=10",
            "ssm.state.command.result.default_ttl": "minutes=5",
            "ssm.feature.maintenance_window.start_ahead,Stable": {
                    "DefaultValue": "minutes=15",
                    "Format": "Duration",
                    "Description": """Start instances this specified time ahead of the next Maintenance Window.

In order to ensure that instances are up and ready when an SSM Maintenance Window starts, they are started in advance of the 'NextExecutionTime' defined in the SSM Maintenance Window object.
            """
            },
            "ssm.feature.maintenance_window.start_ahead.max_jitter": "66%",
            "ssm.feature.maintenance_window.global_defaults": "CS-GlobalDefaultMaintenanceWindow",
            "ssm.feature.maintenance_window.defaults": "CS-{GroupName}",
            "ssm.feature.maintenance_window.mainfleet.defaults": "CS-{GroupName}-Mainfleet",
            "ssm.feature.maintenance_window.mainfleet.ec2.schedule.min_instance_count": {
                    "DefaultValue": "100%",
                    "Format": "IntegerOrPercentage",
                    "Description": """Minimum number of instances serving in the fleet when the Maintenance Window occurs.

> Note: If this value is set to the special value '100%', the setting [`ec2.schedule.desired_instance_count`](#ec2scheduledesired_instance_count) is also forced to '100%'. This implies that any LightHouse instances will also be started and full fleet stability ensured during the Maintenance Window.
            """
            },
            "ssm.feature.maintenance_window.subfleet.__all__.defaults": "CS-{GroupName}-Subfleet.__all__",
            "ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults": "CS-{GroupName}-Subfleet.{SubfleetName}",
            "ssm.feature.maintenance_window.subfleet.{SubfleetName}.ec2.schedule.min_instance_count": {
                    "DefaultValue": "100%",
                    "Format": "IntegerOrPercentage",
                    "Description": """Minimum number of instances serving in the fleet when the Maintenance Window occurs.

> Note: If this value is set to the special value '100%', the setting [`subfleet.{subfleet}.ec2.schedule.desired_instance_count`](#subfleetsubfleetec2scheduledesired_instance_count) is also forced to '100%' ensuring full subfleet stability.
            """
            },
            })

        self.o_state.register_aggregates([
            {
                "Prefix": "ssm.events",
                "Compress": True,
                "DefaultTTL": Cfg.get_duration_secs("ssm.state.default_ttl"),
                "Exclude" : []
            },
            ])
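# Several defaults registered above are duration strings such as "hours=1" or "minutes=15",
# consumed through Cfg.get_duration_secs(). As an illustration only (the real parser lives in
# the project's Cfg module and may behave differently), such strings can be turned into seconds
# with datetime.timedelta:
from datetime import timedelta

def duration_to_seconds(spec):
    """Convert 'hours=1' / 'minutes=15,seconds=30' style strings into seconds.

    Plain integers are treated as seconds. This is an illustrative stand-in, not the
    project's actual implementation.
    """
    spec = spec.strip()
    if spec.isdigit():
        return int(spec)
    kwargs = {}
    for part in spec.split(","):
        key, value = part.split("=")
        kwargs[key.strip()] = float(value)
    return int(timedelta(**kwargs).total_seconds())

# duration_to_seconds("minutes=15") == 900
# duration_to_seconds("hours=1")    == 3600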
    def send_commands(self):        
        if not Cfg.get_int("ssm.enable"):
            return

        client = self.context["ssm.client"]
        refs   = {
            "Linux": {
                "document": "AWS-RunShellScript",
                "shell": [s.rstrip() for s in io.StringIO(str(misc.get_url("internal:cs-ssm-agent.sh"), "utf-8")).readlines()],
                "ids": [],
            }
        }
        # Purge already replied results
        valid_cmds = []
        for cmd in self.run_cmd_states["Commands"]:
            if cmd.get("Complete") or cmd["Expiration"] < misc.seconds_from_epoch_utc():
                continue
            valid_cmds.append(cmd)
        self.run_cmd_states["Commands"] = valid_cmds
        # Purge outdated former results
        former_results  = self.run_cmd_states["FormerResults"]
        for i in list(former_results.keys()):
            for cmd in list(former_results[i].keys()):
                if former_results[i][cmd]["Expiration"] < misc.seconds_from_epoch_utc():
                    del former_results[i][cmd]
            if len(former_results[i].keys()) == 0:
                del former_results[i]

        # Send commands
        for cmd in self.commands_to_send:
            platforms = {}
            for i in cmd["InstanceIds"]:
                info = self.is_instance_online(i)
                if info is None:
                    continue
                platform_type = info["PlatformType"]
                pltf          = refs.get(platform_type)
                if pltf is None:
                    log.warning("Can't run a command on an unsupported platform : %s" % info["PlatformType"])
                    continue # Unsupported platform
                if platform_type not in platforms:
                    platforms[platform_type] = copy.deepcopy(pltf)
                if i not in platforms[platform_type]["ids"]:
                    platforms[platform_type]["ids"].append(i)

            command = cmd["Command"]
            args    = cmd["CommandArgs"]
            for p in platforms:
                pltf         = platforms[p]
                instance_ids = pltf["ids"]
                if not len(instance_ids):
                    continue
                document     = pltf["document"]
                shell        = pltf["shell"]
                i_ids        = instance_ids
                # Perform string parameter substitutions in the helper script
                shell_input = [l.replace("##Cmd##", command) for l in shell]
                shell_input = [l.replace("##ApiGwUrl##", self.context["InteractAPIGWUrl"]) for l in shell_input]
                if isinstance(args, str):
                    shell_input = [l.replace("##Args##", args) for l in shell_input]
                else:
                    shell_input = [l.replace("##Args##", args["Args"] if "Args" in args else "") for l in shell_input]
                    for s in args:
                        shell_input = [l.replace(f"##{s}##", str(args[s])) for l in shell_input]

                while len(i_ids):
                    log.log(log.NOTICE, f"SSM SendCommand({p}): {command}({args}) to %s." % i_ids[:50])

                    try:
                        response = client.send_command(
                            InstanceIds=i_ids[:50],
                            DocumentName=document,
                            TimeoutSeconds=cmd["Timeout"],
                            Comment=cmd["Comment"],
                            Parameters={
                                'commands': shell_input,
                                'executionTimeout': [str(cmd["Timeout"])]
                            },
                            MaxConcurrency='100%',
                            MaxErrors='100%',
                            CloudWatchOutputConfig={
                                'CloudWatchLogGroupName': self.context["SSMLogGroup"],
                                'CloudWatchOutputEnabled': True
                            }
                        )
                        self.run_cmd_states["Commands"].append({
                            "Id": response["Command"]["CommandId"],
                            "InstanceIds": i_ids[:50],
                            "ReceivedInstanceIds": [],
                            "Command": command,
                            "CommandArgs": args,
                            "Results": {},
                            "Expiration": misc.seconds_from_epoch_utc() + Cfg.get_duration_secs("ssm.state.command.default_ttl")
                        })
                        log.log(log.NOTICE, f"SSM RunCommand (Id:%s) : {command}({args})" % response["Command"]["CommandId"])
                    except Exception as e:
                        # Under rare circumstances, we can receive an exception while trying to send the command
                        log.log(log.NOTICE, f"Failed to do SSM SendCommand : {e}, %s" % i_ids[:50])
                    i_ids = i_ids[50:]
        self.o_state.set_state_json("ssm.events.run_commands", self.run_cmd_states, compress=True, TTL=self.ttl)
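# Stripped of the CloneSquad-specific state tracking, the dispatch above is essentially
# "send AWS-RunShellScript in batches of at most 50 instance ids". A minimal sketch with
# plain boto3 (region and timeout values are illustrative assumptions):
import boto3

def run_shell_script(instance_ids, commands, timeout=600, region_name="eu-west-1"):
    """Send shell commands to SSM-managed instances, 50 instance ids per send_command call."""
    ssm = boto3.client("ssm", region_name=region_name)
    command_ids = []
    remaining = list(instance_ids)
    while remaining:
        response = ssm.send_command(
            InstanceIds=remaining[:50],
            DocumentName="AWS-RunShellScript",
            TimeoutSeconds=timeout,
            Parameters={
                "commands": commands,
                "executionTimeout": [str(timeout)]
            },
            MaxConcurrency="100%",
            MaxErrors="100%")
        command_ids.append(response["Command"]["CommandId"])
        remaining = remaining[50:]
    return command_ids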
def _record_call(need_shortterm_record, is_success_func, f, *args, **kwargs):
    global records
    global notify_mgr
    record = {}
    record["EventType"] = f.__name__
    record["Input"] = {"*args": list(args), "**kwargs": dict(kwargs)}

    managed_exception = None
    xray_recorder.begin_subsegment("notifycall-call:%s" % f.__name__)
    try:
        r = f(*args, **kwargs)
        record["Output"] = json.dumps(r, default=str)
    except Exception as e:
        managed_exception = e
        record["Except"] = {
            "Exception": traceback.format_exc(),
            "Stackstrace": traceback.extract_stack(),
            "Reason": json.dumps(e, default=str)
        }
    xray_recorder.end_subsegment()

    if managed_exception is not None:
        # Persist all aggregated data now so as not to lose it
        xray_recorder.begin_subsegment("notifycall-persist_aggregates:%s" %
                                       f.__name__)
        try:
            KVTable.persist_aggregates()
        except Exception as e:
            log.exception("Failed to persist aggregated data!")
        xray_recorder.end_subsegment()

    if notify_mgr is None or do_not_notify:
        log.debug(
            "Do not write Event in event table: notify_mgr=%s, do_not_notify=%s"
            % (notify_mgr, do_not_notify))
        if managed_exception is not None:
            raise managed_exception
        return r

    ctx = notify_mgr.context

    try:
        need_longterm_record = managed_exception is not None or not is_success_func(
            args, kwargs, r) if is_success_func is not None else False
    except Exception as e:
        log.exception(
            "Got an exception while assessing long term event management : %s"
            % e)
        need_longterm_record = True

    # Try to capture as much metadata as possible to ease later diagnosis.
    #    Protect each step against exceptions to ensure proper logging.
    record["Metadata"] = {}
    xray_recorder.begin_subsegment("notifycall-build_metadata:%s" % f.__name__)
    try:
        notify_mgr.ec2.get_prerequisites(only_if_not_already_done=True)
        record["Metadata"]["EC2"] = {
            "AllInstanceDetails":
            notify_mgr.ec2.get_instances(),
            "AllInstanceStatuses":
            notify_mgr.ec2.get_instance_statuses(),
            "DrainingInstances": [
                i["InstanceId"]
                for i in notify_mgr.ec2.get_instances(ScalingState="draining")
            ],
            "BouncedInstances": [
                i["InstanceId"]
                for i in notify_mgr.ec2.get_instances(ScalingState="bounced")
            ],
            "ExcludedInstances": [
                i["InstanceId"]
                for i in notify_mgr.ec2.get_instances(ScalingState="excluded")
            ],
            "ErrorInstances": [
                i["InstanceId"]
                for i in notify_mgr.ec2.get_instances(ScalingState="error")
            ],
            "ScalingStates":
            notify_mgr.ec2.get_all_scaling_states()
        }
    except Exception as e:
        log.exception('Failed to create record["Metadata"]["EC2"] : %s' % e)
    xray_recorder.end_subsegment()
    xray_recorder.begin_subsegment("notifycall-build_metadata_targetgroup:%s" %
                                   f.__name__)
    try:
        notify_mgr.targetgroup.get_prerequisites(only_if_not_already_done=True)
        record["Metadata"][
            "TargetGroups"] = notify_mgr.targetgroup.get_targetgroups_info()
    except Exception as e:
        log.exception(
            'Failed to create record["Metadata"]["TargetGroups"] : %s' % e)
    xray_recorder.end_subsegment()

    for key in ["Metadata"]:
        zipped_bytes = gzip.compress(
            bytes(json.dumps(record[key], default=str), "utf-8"))
        record[key] = str(base64.b64encode(zipped_bytes), "utf-8")

    now = misc.utc_now()
    now_seconds = misc.seconds_from_epoch_utc()
    max_longterm_records = Cfg.get_int("notify.event.longterm.max_records")
    if max_longterm_records <= 0:
        need_longterm_record = 0

    tables = [
        {
            "Name": ctx["EventTable"],
            "NeedWrite": need_shortterm_record,
            "TTL": Cfg.get_duration_secs("notify.event.default_ttl"),
            "DBImages": False,
            "DebugReport": False
        },
        {
            "Name": ctx["LongTermEventTable"],
            "NeedWrite": need_longterm_record,
            "TTL": Cfg.get_duration_secs("notify.event.longterm.ttl"),
            "DBImages": True,
            "DebugReport": True
        },
    ]
    xray_recorder.begin_subsegment("notifycall-update_tables:%s" % f.__name__)
    for table in tables:
        if not table["NeedWrite"]:
            continue
        UpdateExpression = "set EventSource=:entrypoint, EventType=:eventtype, InputData=:input, OutputData=:output, HandledException=:exception, "
        UpdateExpression += "Metadata=:metadata, ExpirationTime=:expirationtime"
        ExpressionAttributeValues = {
            ':entrypoint': {
                'S': ctx["FunctionName"]
            },
            ':eventtype': {
                'S': record["EventType"]
            },
            ':input': {
                'S': json.dumps(record["Input"], default=str)
            },
            ':output': {
                'S':
                json.dumps(record["Output"] if "Output" in record else {},
                           default=str)
            },
            ':exception': {
                'S':
                json.dumps(record["Except"] if "Except" in record else "",
                           default=str)
            },
            ':metadata': {
                'S': json.dumps(record["Metadata"], default=str)
            },
            ':expirationtime': {
                'N': str(now_seconds + table["TTL"])
            }
        }
        if table["DBImages"]:
            # Insert snapshots of the CloudWatch dashboard
            try:
                log.log(log.NOTICE,
                        "Generating snapshots for Dashboard graphs...")
                images = notify_mgr.cloudwatch.get_dashboard_images()
                for i in images:
                    compressed_name = i.replace(" ", "")
                    UpdateExpression += ", Graph_%s_PNG=:graph%s" % (
                        compressed_name, compressed_name)
                    ExpressionAttributeValues[":graph%s" % compressed_name] = {
                        'S': images[i]
                    }
                log.info(
                    "/!\ Generated CloudWatch dashboard PNG snapshots in DynamoDB table '%s' for further event analysis!"
                    % table["Name"])
            except Exception as e:
                log.exception(
                    "Failed to retrieve CloudWatch snapshot images! : %s" % e)

        response = ctx["dynamodb.client"].update_item(
            Key={"EventDate": {
                'S': str(now)
            }},
            UpdateExpression=UpdateExpression,
            ExpressionAttributeValues=ExpressionAttributeValues,
            ReturnConsumedCapacity='TOTAL',
            TableName=table["Name"],
        )

        log.debug(Dbg.pprint(response))
        log.log(
            log.NOTICE, "Written event '[%s] %s' to table '%s'." %
            (str(now), record["EventType"], table["Name"]))

        # Keep the number of LongTerm items stored in the DynamoDB table under control
        if need_longterm_record:
            longterm_item_eventdates = [
                m["_"] for m in notify_mgr.state.get_metastring_list(
                    "notify.longterm.itemlist", default=[])
            ]
            log.log(log.NOTICE,
                    "Guessed number of records in LongTerm Event table : %d",
                    len(longterm_item_eventdates))
            longterm_item_eventdates.append(str(now))
            nb_records_to_delete = max(
                len(longterm_item_eventdates) - max_longterm_records, 0)
            for eventdate in longterm_item_eventdates[:nb_records_to_delete]:
                try:
                    response = ctx["dynamodb.client"].delete_item(
                        Key={'EventDate': {
                            'S': eventdate
                        }},
                        TableName=ctx["LongTermEventTable"])
                    log.debug(response)
                    log.log(
                        log.NOTICE,
                        "Purged LongTerm Event record '%s' as too many are already stored (notify.event.longterm.max_records=%d)"
                        % (eventdate, max_longterm_records))
                except Exception as e:
                    log.exception(
                        "Got exception while deleting LongTerm record '%s' : %s"
                        % (eventdate, e))
            notify_mgr.state.set_state(
                "notify.longterm.itemlist",
                ";".join(longterm_item_eventdates[nb_records_to_delete:]),
                TTL=Cfg.get_duration_secs("notify.event.longterm.ttl"))
            try:
                KVTable.persist_aggregates()
            except Exception as e:
                log.exception("Got exception while persisting KVTables : %s" %
                              e)

        # Manage Debug report export to S3
        url = ctx["LoggingS3Path"]
        if url != "" and table["DebugReport"] and Cfg.get_int(
                "notify.debug.send_s3_reports"):
            xray_recorder.begin_subsegment(
                "notifycall-publish_all_reports:%s" % f.__name__)
            if ctx["FunctionName"] == "Interact":
                # Avoid recursion if throwing from InteractFunction
                log.info("Publishing Debug reports synchronously...")
                debug.publish_all_reports(ctx, url, "notifymgr_report")
            else:
                client = ctx["sqs.client"]
                log.info(
                    "Notifying Interact SQS Queue '%s' for asynchronous debug report generation..."
                    % ctx["InteractSQSUrl"])
                response = client.send_message(QueueUrl=ctx["InteractSQSUrl"],
                                               MessageBody=json.dumps({
                                                   "OpType":
                                                   "Debug/PublishReportNow",
                                                   "Events": {
                                                       "Timestamp":
                                                       str(ctx["now"])
                                                   }
                                               }))
                log.debug(response)
            xray_recorder.end_subsegment()

    xray_recorder.end_subsegment()
    if managed_exception is not None:
        raise managed_exception
    return r
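# The Metadata field above is gzip-compressed and base64-encoded before being written to
# DynamoDB. The following helper pair sketches that round trip; the decode direction is an
# assumption, handy when reading stored events back for analysis:
import base64
import gzip
import json

def compress_field(value):
    """Gzip + base64 encode a JSON-serializable value, as done for record["Metadata"]."""
    raw = bytes(json.dumps(value, default=str), "utf-8")
    return str(base64.b64encode(gzip.compress(raw)), "utf-8")

def decompress_field(encoded):
    """Reverse the encoding to inspect a stored Metadata blob."""
    return json.loads(gzip.decompress(base64.b64decode(encoded)))

# decompress_field(compress_field({"a": 1})) == {"a": 1}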
Exemple #26
0
 def save_cached_data(self, data):
     d = misc.encode_json(data, compress=True)
     self.context["o_state"].set_state(
         "interact.precomputed",
         d,
         TTL=max(Cfg.get_duration_secs("app.run_period") * 2, 240))