Exemple #1
0
 def get_scaling_state(self,
                       instance_id,
                       default=None,
                       meta=None,
                       default_date=None,
                       do_not_return_excluded=False):
     """Return the scaling state recorded for an instance.

     The value is read from the "ec2.instance.scaling.state.<instance_id>"
     state key and may then be overridden, in this order:

     * "excluded" when the instance is known, `do_not_return_excluded` is
       false and either the instance carries the "clonesquad:excluded" tag
       (value "1", "True" or "true"), its id is listed in the
       "ec2.state.excluded_instance_ids" setting, or
       is_static_subfleet_instance() reports it as such;
     * "error" when its id is listed in "ec2.state.error_instance_ids"
       (debug facility).

     Args:
         instance_id: Id of the instance to look up.
         default: Value returned by the state lookup when no state is stored.
         meta: Optional dict. When supplied, it is filled in place with
             "last_action_date", "last_draining_date", "last_error_date"
             and "last_bounced_date" entries; a missing date falls back to
             self.context["now"].
         default_date: Accepted for interface compatibility; not used by
             this method.
         do_not_return_excluded: When true, the "excluded" override is
             skipped.

     Returns:
         The (possibly overridden) scaling state.
     """
     if meta is not None:
         for event in ("action", "draining", "error", "bounced"):
             meta["last_%s_date" % event] = misc.str2utc(
                 self.get_state("ec2.instance.scaling.last_%s_date.%s" %
                                (event, instance_id),
                                default=self.context["now"]))
     state = self.get_state("ec2.instance.scaling.state.%s" % instance_id,
                            default=default)
     # Special case for 'excluded': we test it here so tags will override
     # the stored value.
     instance = self.get_instance_by_id(instance_id)
     excluded_instances = Cfg.get_list("ec2.state.excluded_instance_ids",
                                       default=[])
     # The configured list holds instance ids, so membership must be tested
     # with the id (testing the instance record itself could never match).
     if (instance is not None and not do_not_return_excluded
             and (self.instance_has_tag(
                 instance, "clonesquad:excluded", value=["1", "True", "true"])
                  or instance_id in excluded_instances
                  or self.is_static_subfleet_instance(instance_id))):
         state = "excluded"
     # Force error state for some VM (debug usage)
     error_instance_ids = Cfg.get_list("ec2.state.error_instance_ids",
                                       default=[])
     if instance_id in error_instance_ids:
         state = "error"
     return state
Exemple #2
0
    def _decode_integrate_float(self, key, integration_period):
        """Decode the time-stamped float records stored under a state key.

        The stored value is a ';'-separated list of "<date>=<float>" records.
        Records that cannot be parsed are skipped, as are records whose age
        relative to self.context["now"] is not strictly lower than
        `integration_period` seconds.

        Args:
            key: State key holding the encoded records.
            integration_period: Maximum record age, in seconds.

        Returns:
            A list of [encoded_record, date, value] triplets, in stored order.
        """
        now = self.context["now"]
        raw = self.get_state(key, None)
        records = [] if raw is None else raw.split(";")

        recs = []
        for record in records:
            fields = record.split("=")
            try:
                date = misc.str2utc(fields[0])
                value = float(fields[1])
                if now - date < timedelta(seconds=integration_period):
                    recs.append(["%s=%s" % (date, value), date, value])
            except Exception:
                # Best effort: ignore malformed records. Unlike a bare
                # 'except', this lets KeyboardInterrupt/SystemExit through.
                continue
        return recs
Exemple #3
0
    def get_prerequisites(self):
        """Load alarm definitions, existing alarms and their metrics.

        Steps performed, in order:

        1. Read each "cloudwatch.alarmNN.configuration_url" setting into
           self.alarm_definitions. A URL starting with "alarmname:" refers
           to an existing alarm by name; any other URL is fetched and its
           content stored under "Content".
        2. List CloudWatch alarms and keep in self.alarms those for which
           get_alarm_configuration_by_name() returns a configuration.
        3. Warn about "alarmname:" definitions without a matching alarm.
        4. Build the GetMetricData queries: a subset of alarms (oldest
           cached sample first, sized from the instance scale score) when
           "ec2.schedule.desired_instance_count" is "-1", plus all
           "alarmname:" alarms, plus CPUCreditBalance for burstable
           instances at most once per minute.
        5. Run the queries in batches of 500, merge fresh results with
           still-valid cached metrics, then persist the cache and the query
           counter.
        6. Attach each cached metric to its alarm data ("MetricDetails") or
           to its burstable instance ("_Metrics").
        """
        now = self.context["now"]
        client = self.context["cloudwatch.client"]

        # Read all CloudWatch alarm templates into memory
        alarm_definitions = {}
        for i in range(0, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
            key = "cloudwatch.alarm%02d.configuration_url" % (i)
            r = Cfg.get_extended(key)
            if not r["Success"] or r["Value"] == "":
                continue

            d = misc.parse_line_as_list_of_dict(r["Value"])
            url = d[0]["_"]
            meta = d[0]

            index = "%02d" % i
            alarm_defs = {
                "Index": index,
                "Key": key,
                "Url": url,
                "Definition": r,
                "Metadata": meta
            }

            prefix = "alarmname:"
            if url.startswith(prefix):
                alarm_defs["AlarmName"] = url[len(prefix):]
            else:
                log.log(log.NOTICE, "Read Alarm definition: %s" % r["Value"])
                try:
                    resp = misc.get_url(url.format(**self.context))
                    if resp is None:
                        raise Exception("URL content = <None>")
                    alarm_defs["Content"] = str(resp, "utf-8")
                except Exception as e:
                    # '%s' (not '%e', which only accepts real numbers) so
                    # that reporting the failure does not itself raise.
                    log.exception("Failed to load Alarm definition '%s' : %s" %
                                  (r["Value"], e))
                    continue
            alarm_definitions[index] = alarm_defs

        self.alarm_definitions = alarm_definitions

        # Read all existing CloudWatch alarms (paginated with NextToken)
        alarms = []
        response = None
        while (response is None or "NextToken" in response):
            response = client.describe_alarms(MaxRecords=Cfg.get_int(
                "cloudwatch.describe_alarms.max_results"),
                                              NextToken=response["NextToken"]
                                              if response is not None else "")
            for alarm in response["MetricAlarms"]:
                alarm_name = alarm["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is not None:
                    # This is an alarm that belongs to this CloneSquad instance
                    alarms.append(alarm)
        self.alarms = alarms

        # Sanity check: every "alarmname:" definition should match an alarm
        for alarm_def in self.alarm_definitions.values():
            if "AlarmName" not in alarm_def:
                continue
            alarm = next(
                filter(lambda a: a["AlarmName"] == alarm_def["AlarmName"],
                       self.alarms), None)
            if alarm is None:
                log.warning(
                    "Alarm definition [%s](%s => %s) doesn't match an existing CloudWatch alarm!"
                    % (alarm_def["Definition"]["Key"],
                       alarm_def["Definition"]["Value"],
                       alarm_def["Definition"]["Status"]))

        # Read all metrics associated with alarms

        # CloudWatch intense polling can be expensive: This algorithm links CW metric polling rate to the
        #    scale rate => Under intense scale up condition, polling is aggressive. If not, it falls down
        #    to one polling every 'cloudwatch.metrics.low_rate_polling_interval' seconds
        # TODO(@jcjorel): Avoid this kind of direct references to an upper level module!!
        integration_period = Cfg.get_duration_secs(
            "ec2.schedule.horizontalscale.integration_period")
        instance_scale_score = self.ec2.get_integrated_float_state(
            "ec2.schedule.scaleout.instance_scale_score", integration_period)

        self.metric_cache = self.get_metric_cache()

        query = {"IdMapping": {}, "Queries": []}

        # Build query for Alarm metrics
        if Cfg.get("ec2.schedule.desired_instance_count") == "-1":
            # Sort by oldest alarms first in cache
            cached_metric_names = [m["_MetricId"] for m in self.metric_cache]
            valid_alarms = []
            for a in alarms:
                alarm_name = a["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is None or alarm_def["AlarmDefinition"][
                        "Url"].startswith("alarmname:"):
                    continue
                # Alarms never sampled so far get the epoch as sampling time
                # so that they sort first.
                a["_SamplingTime"] = self.get_metric_by_id(
                    alarm_name
                )["_SamplingTime"] if alarm_name in cached_metric_names else str(
                    misc.epoch())
                valid_alarms.append(a)
            sorted_alarms = sorted(
                valid_alarms, key=lambda a: misc.str2utc(a["_SamplingTime"]))

            # We poll from the oldest to the newest and depending on the instance_scale_score to limit CloudWatch GetMetricData costs
            time_for_full_metric_refresh = max(
                Cfg.get_duration_secs(
                    "cloudwatch.metrics.time_for_full_metric_refresh"), 1)
            app_run_period = Cfg.get_duration_secs("app.run_period")
            minimum_polled_alarms_per_run = Cfg.get_int(
                "cloudwatch.metrics.minimum_polled_alarms_per_run")
            maximum_polled_alarms_per_run = app_run_period / time_for_full_metric_refresh
            maximum_polled_alarms_per_run = min(maximum_polled_alarms_per_run,
                                                1.0)
            weight = min(instance_scale_score, maximum_polled_alarms_per_run)
            max_alarms_for_this_run = max(
                minimum_polled_alarms_per_run,
                int(min(weight, 1.0) * len(sorted_alarms)))
            for alarm in sorted_alarms[:max_alarms_for_this_run]:
                alarm_name = alarm["AlarmName"]
                CloudWatch._format_query(query, alarm_name, alarm)

            # We always poll user supplied alarms
            for alarm in alarms:
                alarm_name = alarm["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is None:
                    continue  # Unknown alarm name
                if not alarm_def["AlarmDefinition"]["Url"].startswith(
                        "alarmname:"):
                    continue
                CloudWatch._format_query(query, alarm_name, alarm)

        # Query Metric for Burstable instances (at most once per minute)
        burstable_instances = self.ec2.get_burstable_instances(
            ScalingState="-error")
        last_collect_date = self.ec2.get_state_date(
            "cloudwatch.metrics.last_burstable_metric_collect_date")
        if last_collect_date is None or (now - last_collect_date) > timedelta(
                minutes=1):
            for i in burstable_instances:
                instance_id = i["InstanceId"]
                if not self.ec2.is_static_subfleet_instance(
                        instance_id) and self.ec2.get_scaling_state(
                            instance_id) == "excluded":
                    continue
                CloudWatch._format_query(
                    query, "%s/%s" % ("CPUCreditBalance", instance_id), {
                        "MetricName":
                        "CPUCreditBalance",
                        "Namespace":
                        "AWS/EC2",
                        "Dimensions": [{
                            "Name": "InstanceId",
                            "Value": instance_id
                        }],
                        "Period":
                        300,
                        "Statistic":
                        "Average"
                    })
            self.ec2.set_state(
                "cloudwatch.metrics.last_burstable_metric_collect_date",
                now,
                TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))

        # Make request to CloudWatch, 500 queries per GetMetricData batch
        query_counter = self.ec2.get_state_int(
            "cloudwatch.metric.query_counter", default=0)
        queries = query["Queries"]
        metric_results = []
        metric_ids = []
        no_metric_ids = []
        while len(queries) > 0:
            q = queries[:500]
            queries = queries[500:]
            results = []
            response = None
            while response is None or "NextToken" in response:
                args = {
                    "MetricDataQueries":
                    q,
                    "StartTime":
                    now - timedelta(seconds=Cfg.get_duration_secs(
                        "cloudwatch.metrics.data_period")),
                    "EndTime":
                    now
                }
                if response is not None:
                    args["NextToken"] = response["NextToken"]
                response = client.get_metric_data(**args)
                results.extend(response["MetricDataResults"])
                query_counter += len(q)

            for r in results:
                if r["StatusCode"] != "Complete":
                    log.error("Failed to retrieve metrics: %s" % q)
                    continue
                metric_id = query["IdMapping"][r["Id"]]
                if len(r["Timestamps"]) == 0:
                    if metric_id not in no_metric_ids:
                        no_metric_ids.append(metric_id)
                    continue
                if metric_id not in metric_ids:
                    metric_ids.append(metric_id)
                r["_MetricId"] = metric_id
                r["_SamplingTime"] = str(now)
                log.debug(r)
                metric_results.append(r)
        if len(no_metric_ids):
            log.info("No metrics returned for alarm '%s'" % no_metric_ids)

        # Merge with existing cache metric: fresh results take precedence,
        # older entries are kept while within the retention period.
        metric_cache = self.metric_cache
        self.metric_cache = metric_results
        for m in metric_cache:
            max_retention_period = Cfg.get_duration_secs(
                "cloudwatch.metrics.cache.max_retention_period")
            if m["_MetricId"] in metric_ids or "_SamplingTime" not in m:
                continue
            if (now - misc.str2utc(m["_SamplingTime"])
                ).total_seconds() < max_retention_period:
                self.metric_cache.append(m)

        self.ec2.set_state("cloudwatch.metric.query_counter",
                           query_counter,
                           TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
        self.ec2.set_state_json(
            "cloudwatch.metrics.cache",
            self.metric_cache,
            TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
        self.set_metric("Cloudwatch.GetMetricData", query_counter)

        # Augment Alarm definitions and Instances with associated metrics
        for metric in self.metric_cache:
            metric_id = metric["_MetricId"]

            alarm_data = self.get_alarm_data_by_name(metric_id)
            if alarm_data is not None:
                alarm_data["MetricDetails"] = metric
                continue

            instance = next(
                filter(
                    lambda i: "CPUCreditBalance/%s" % i["InstanceId"] ==
                    metric_id, burstable_instances), None)
            if instance is not None:
                instance["_Metrics"] = {}
                instance["_Metrics"]["CPUCreditBalance"] = metric
def get_date(key, default=None):
    """Return the value stored under `key` converted with misc.str2utc().

    `default` is returned when no value is stored; it is also handed to
    misc.str2utc() as its own `default` argument.
    """
    raw_value = get(key)
    if raw_value is not None:
        return misc.str2utc(raw_value, default=default)
    return default
    def get_prerequisites(self):
        """ Gather instance status by calling SSM APIs.

        Does nothing (besides logging a notice) when "ssm.enable" is false.
        Otherwise:

        1. Collect the maintenance window names configured at global,
           default, main fleet, "__all__" subfleet and per-subfleet level.
        2. Describe those windows through SSM (20 names per request),
           ignoring disabled ones.
        3. Attach tags to each window, first from the resource listing
           returned by the state object, then with list_tags_for_resource()
           as a fallback. Windows without tags are discarded with a warning
           unless their name is one of the global defaults.
        4. Store the outcome in self.maintenance_windows
           ({"Names": ..., "Windows": ...}), refresh pending command
           statuses, run maintenance window housekeeping and ask the EC2
           module to update its SSM initializing states.
        """
        if not Cfg.get_int("ssm.enable"):
            log.log(log.NOTICE, "SSM support is currently disabled. Set ssm.enable to 1 to enabled it.")
            return
        self.ttl  = Cfg.get_duration_secs("ssm.state.default_ttl")
        GroupName = self.context["GroupName"]

        misc.initialize_clients(["ssm"], self.context)
        client = self.context["ssm.client"]

        # Retrieve all SSM maintenance windows applicable to this CloneSquad deployment
        mw_names = {
            "__globaldefault__": {},
            "__default__": {},
            "__main__": {},
            "__all__":  {}
        }

        fmt                              = self.context.copy()
        mw_names["__globaldefault__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.global_defaults", fmt=fmt)
        mw_names["__default__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.defaults", fmt=fmt)
        mw_names["__main__"]["Names"]    = Cfg.get_list("ssm.feature.maintenance_window.mainfleet.defaults", fmt=fmt)
        mw_names["__all__"]["Names"]     = Cfg.get_list("ssm.feature.maintenance_window.subfleet.__all__.defaults", fmt=fmt)

        # Work on a copy: extending the global default list in place would
        # make every collected name look like a global default, which would
        # disable the untagged-window check further down.
        all_mw_names = list(mw_names["__globaldefault__"]["Names"])
        for scope in ("__default__", "__main__", "__all__"):
            all_mw_names.extend([ n for n in mw_names[scope]["Names"] if n not in all_mw_names])

        # The '{SubfleetName}' placeholders below are literal parts of the
        # configuration key names (plain strings, not f-strings).
        Cfg.register({
                "ssm.feature.maintenance_window.subfleet.__all__.force_running":
                    Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running"),
                "ssm.feature.events.ec2.scaling_state_changes.draining.__main__.connection_refused_tcp_ports":
                    Cfg.get("ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports")
            })

        for SubfleetName in self.o_ec2.get_subfleet_names():
            fmt["SubfleetName"] = SubfleetName
            mw_names[f"Subfleet.{SubfleetName}"] = {}
            Cfg.register({
                f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults": Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults"),
                f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.ec2.schedule.min_instance_count":
                    Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.ec2.schedule.min_instance_count"),
                f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running":
                    Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running"),
                f"ssm.feature.events.ec2.scaling_state_changes.draining.{SubfleetName}.connection_refused_tcp_ports":
                    Cfg.get("ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports")
            })
            mw_names[f"Subfleet.{SubfleetName}"]["Names"] = Cfg.get_list(f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults", fmt=fmt)
            all_mw_names.extend([ n for n in mw_names[f"Subfleet.{SubfleetName}"]["Names"] if n not in all_mw_names])

        # Describe the windows, 20 names per request
        names = all_mw_names
        mws   = []
        while len(names):
            paginator = client.get_paginator('describe_maintenance_windows')
            response_iterator = paginator.paginate(
                Filters=[
                    {
                        'Key': 'Name',
                        'Values': names[:20]
                    },
                ])
            for r in response_iterator:
                for wi in r["WindowIdentities"]:
                    if not wi["Enabled"]:
                        log.log(log.NOTICE, "SSM Maintenance Window '%s' not enabled. Ignored..." % wi["Name"])
                        continue
                    if "NextExecutionTime" not in wi:
                        log.log(log.NOTICE, "/!\\ SSM Maintenance Window '%s' without 'NextExecutionTime'." % wi["Name"])
                    if wi not in mws:
                        mws.append(wi)
            names = names[20:]
        # Make string dates as object dates
        for d in mws:
            if "NextExecutionTime" in d:
                d["NextExecutionTime"] = misc.str2utc(d["NextExecutionTime"])

        # Retrieve Maintenance Window tags with the resourcegroup API
        tagged_mws = self.context["o_state"].get_resources(service="ssm", resource_name="maintenancewindow")
        for tmw in tagged_mws:
            mw_id = tmw["ResourceARN"].split("/")[1]
            mw = next(filter(lambda w: w["WindowId"] == mw_id, mws), None)
            if mw:
                mw["Tags"] = tmw["Tags"]
        valid_mws = []
        for mw in mws:
            mw_id=mw["WindowId"]
            if "Tags" not in mw:
                # Fallback: ask SSM directly for the tags of this window
                try:
                    response   = client.list_tags_for_resource(ResourceType='MaintenanceWindow', ResourceId=mw_id)
                    mw["Tags"] = response['TagList'] if 'TagList' in response else []
                except Exception:
                    log.error(f"Failed to fetch Tags for MaintenanceWindow '{mw_id}'")
            if ("Tags" not in mw or not len(mw["Tags"])) and mw["Name"] not in mw_names["__globaldefault__"]["Names"]:
                log.warning("Please tag SSM Maintenance Window '%s/%s' with 'clonesquad:group-name': '%s'!" %
                        (mw["Name"], mw["WindowId"], GroupName))
                continue
            valid_mws.append(mw)

        self.maintenance_windows = {
            "Names": mw_names,
            "Windows": valid_mws
        }

        # Update asynchronous results from previously launched commands
        self.update_pending_command_statuses()

        # Perform maintenance window house keeping
        self.manage_maintenance_windows()
        if len(mws):
            log.log(log.NOTICE, "Found matching SSM maintenance windows: %s" % self.maintenance_windows["Windows"])

        # Hard dependency toward EC2 module. We update the SSM instance initializing states
        self.o_ec2.update_ssm_initializing_states()
def seconds_since_last_call():
    """Return the seconds elapsed since the date stored in
    ctx["main.last_call_date"], or 0 when that key is absent.

    misc.epoch() is passed to misc.str2utc() as its `default` argument.
    """
    if "main.last_call_date" in ctx:
        current = misc.utc_now()
        previous = misc.str2utc(ctx["main.last_call_date"],
                                default=misc.epoch())
        elapsed = current - previous
        return elapsed.total_seconds()
    return 0