def get_scaling_state(self, instance_id, default=None, meta=None, default_date=None, do_not_return_excluded=False):
    """Return the scaling state ("draining", "error", "excluded", "bounced"...) of an instance.

    :param instance_id: EC2 instance id to query.
    :param default: Value returned when no scaling state is recorded for the instance.
    :param meta: Optional dict; when supplied, it is populated in-place with
        'last_<event>_date' keys (action/draining/error/bounced) parsed as UTC datetimes.
    :param default_date: Kept for interface compatibility (unused in this method).
    :param do_not_return_excluded: When True, do not override the stored state with "excluded".
    :return: The scaling state string (possibly forced to "excluded" or "error").
    """
    if meta is not None:
        # Enrich the caller-supplied dict with the last known event dates.
        for i in ["action", "draining", "error", "bounced"]:
            meta["last_%s_date" % i] = misc.str2utc(
                self.get_state("ec2.instance.scaling.last_%s_date.%s" % (i, instance_id),
                               default=self.context["now"]))
    r = self.get_state("ec2.instance.scaling.state.%s" % instance_id, default=default)

    # Special case for 'excluded': We test it here so tags will override the value
    i = self.get_instance_by_id(instance_id)
    excluded_instances = Cfg.get_list("ec2.state.excluded_instance_ids", default=[])
    # FIX: was `i in excluded_instances` — `i` is the instance description object while
    # 'ec2.state.excluded_instance_ids' holds instance *id* strings, so the membership
    # test could never match. Compare the instance id instead.
    if (i is not None and not do_not_return_excluded and
            (self.instance_has_tag(i, "clonesquad:excluded", value=["1", "True", "true"])
             or instance_id in excluded_instances
             or self.is_static_subfleet_instance(instance_id))):
        r = "excluded"

    # Force error state for some VM (debug usage)
    error_instance_ids = Cfg.get_list("ec2.state.error_instance_ids", default=[])
    if instance_id in error_instance_ids:
        r = "error"
    return r
def _decode_integrate_float(self, key, integration_period):
    """Decode a time series stored under `key` as ';'-separated '<date>=<float>' records.

    Only records younger than `integration_period` seconds (relative to the
    context 'now') are kept.

    :param key: State key holding the serialized series (may be absent).
    :param integration_period: Window size in seconds.
    :return: List of [ "<date>=<value>", datetime, float ] triplets.
    """
    now = self.context["now"]
    v = self.get_state(key, None)
    if v is None:
        records = []
    else:
        records = v.split(";")
    recs = []
    for r in records:
        sp = r.split("=")
        try:
            d = misc.str2utc(sp[0])
            v = float(sp[1])
            if now - d < timedelta(seconds=integration_period):
                recs.append(["%s=%s" % (d, v), d, v])
        except Exception:
            # FIX: narrowed from a bare `except:` which also swallowed
            # SystemExit/KeyboardInterrupt. Malformed records are still
            # skipped silently (deliberate best-effort decode).
            pass
    return recs
def get_prerequisites(self):
    """Collect all CloudWatch state needed by the scheduler.

    Steps, in order:
      1. Load alarm templates from configuration (cloudwatch.alarmNN.configuration_url).
      2. List existing CloudWatch alarms and keep those owned by this CloneSquad instance.
      3. Sanity-check that every 'alarmname:'-style definition matches a real alarm.
      4. Build and execute GetMetricData queries (alarm metrics + burstable-instance
         CPUCreditBalance), throttled by the instance scale score.
      5. Merge results with the metric cache and attach metrics to alarm
         definitions / instances.
    """
    now = self.context["now"]
    client = self.context["cloudwatch.client"]

    # Read all CloudWatch alarm templates into memory
    alarm_definitions = {}
    for i in range(0, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
        key = "cloudwatch.alarm%02d.configuration_url" % (i)
        r = Cfg.get_extended(key)
        if not r["Success"] or r["Value"] == "":
            continue
        d = misc.parse_line_as_list_of_dict(r["Value"])
        url = d[0]["_"]
        meta = d[0]
        index = "%02d" % i
        alarm_defs = {
            "Index": index,
            "Key": key,
            "Url": url,
            "Definition": r,
            "Metadata": meta
        }
        prefix = "alarmname:"
        if url.startswith(prefix):
            # 'alarmname:' URLs reference a user-supplied alarm by name: no content to fetch.
            alarm_defs["AlarmName"] = url[len(prefix):]
        else:
            log.log(log.NOTICE, "Read Alarm definition: %s" % r["Value"])
            try:
                resp = misc.get_url(url.format(**self.context))
                if resp is None:
                    raise Exception("URL content = <None>")
                alarm_defs["Content"] = str(resp, "utf-8")
            except Exception as e:
                # FIX: was '%e' which is a *float* conversion and raised TypeError
                # when formatting the Exception — crashing the error path itself.
                log.exception("Failed to load Alarm definition '%s' : %s" % (r["Value"], e))
                continue
        alarm_definitions[index] = alarm_defs
    self.alarm_definitions = alarm_definitions

    # Read all existing CloudWatch alarms
    alarms = []
    response = None
    while (response is None or "NextToken" in response):
        response = client.describe_alarms(
            MaxRecords=Cfg.get_int("cloudwatch.describe_alarms.max_results"),
            NextToken=response["NextToken"] if response is not None else "")
        #log.debug(Dbg.pprint(response))
        for alarm in response["MetricAlarms"]:
            alarm_name = alarm["AlarmName"]
            alarm_def = self.get_alarm_configuration_by_name(alarm_name)
            if alarm_def is not None:
                # This is an alarm thats belong to this CloneSquad instance
                alarms.append(alarm)
    #log.debug(Dbg.pprint(alarms))
    self.alarms = alarms

    # Sanity check: each named alarm definition must match a live CloudWatch alarm.
    for index in self.alarm_definitions.keys():
        alarm_def = self.alarm_definitions[index]
        if "AlarmName" not in alarm_def:
            continue
        alarm = next(
            filter(lambda a: a["AlarmName"] == alarm_def["AlarmName"], self.alarms), None)
        if alarm is None:
            # NOTE(review): "Definition" holds the Cfg.get_extended() result; it visibly
            # contains "Success"/"Value" — presence of a "Status" key is assumed. Confirm.
            log.warning(
                "Alarm definition [%s](%s => %s) doesn't match an existing CloudWatch alarm!" %
                (alarm_def["Definition"]["Key"], alarm_def["Definition"]["Value"],
                 alarm_def["Definition"]["Status"]))

    # Read all metrics associated with alarms
    # CloudWatch intense polling can be expensive: This algorithm links CW metric polling rate to the
    # scale rate => Under intense scale up condition, polling is aggresive. If not, it falls down
    # to one polling every 'cloudwatch.metrics.low_rate_polling_interval' seconds
    # TODO(@jcjorel): Avoid this kind of direct references to an upper level module!!
    integration_period = Cfg.get_duration_secs(
        "ec2.schedule.horizontalscale.integration_period")
    instance_scale_score = self.ec2.get_integrated_float_state(
        "ec2.schedule.scaleout.instance_scale_score", integration_period)

    self.metric_cache = self.get_metric_cache()

    query = {"IdMapping": {}, "Queries": []}

    # Build query for Alarm metrics
    if Cfg.get("ec2.schedule.desired_instance_count") == "-1":
        # Sort by oldest alarms first in cache
        cached_metric_names = [m["_MetricId"] for m in self.metric_cache]
        valid_alarms = []
        for a in alarms:
            alarm_name = a["AlarmName"]
            alarm_def = self.get_alarm_configuration_by_name(alarm_name)
            # NOTE(review): entries use key "AlarmDefinition" here while the local
            # dicts built above use "Definition" — get_alarm_configuration_by_name
            # presumably returns a different structure; verify against its implementation.
            if alarm_def is None or alarm_def["AlarmDefinition"]["Url"].startswith("alarmname:"):
                continue
            a["_SamplingTime"] = (self.get_metric_by_id(alarm_name)["_SamplingTime"]
                                  if alarm_name in cached_metric_names else str(misc.epoch()))
            valid_alarms.append(a)
        sorted_alarms = sorted(valid_alarms, key=lambda a: misc.str2utc(a["_SamplingTime"]))

        # We poll from the oldest to the newest and depending on the instance_scale_score
        # to limit CloudWacth GetMetricData costs
        time_for_full_metric_refresh = max(
            Cfg.get_duration_secs("cloudwatch.metrics.time_for_full_metric_refresh"), 1)
        app_run_period = Cfg.get_duration_secs("app.run_period")
        minimum_polled_alarms_per_run = Cfg.get_int(
            "cloudwatch.metrics.minimum_polled_alarms_per_run")
        maximum_polled_alarms_per_run = app_run_period / time_for_full_metric_refresh
        maximum_polled_alarms_per_run = min(maximum_polled_alarms_per_run, 1.0)
        weight = min(instance_scale_score, maximum_polled_alarms_per_run)
        max_alarms_for_this_run = max(
            minimum_polled_alarms_per_run, int(min(weight, 1.0) * len(sorted_alarms)))
        for alarm in sorted_alarms[:max_alarms_for_this_run]:
            alarm_name = alarm["AlarmName"]
            CloudWatch._format_query(query, alarm_name, alarm)

    # We always poll user supplied alarms
    for alarm in alarms:
        alarm_name = alarm["AlarmName"]
        alarm_def = self.get_alarm_configuration_by_name(alarm_name)
        if alarm_def is None:
            continue  # Unknown alarm name
        if not alarm_def["AlarmDefinition"]["Url"].startswith("alarmname:"):
            continue
        CloudWatch._format_query(query, alarm_name, alarm)

    # Query Metric for Burstable instances (at most once per minute)
    burstable_instances = self.ec2.get_burstable_instances(ScalingState="-error")
    last_collect_date = self.ec2.get_state_date(
        "cloudwatch.metrics.last_burstable_metric_collect_date")
    if last_collect_date is None or (now - last_collect_date) > timedelta(minutes=1):
        for i in burstable_instances:
            instance_id = i["InstanceId"]
            if not self.ec2.is_static_subfleet_instance(instance_id) and \
                    self.ec2.get_scaling_state(instance_id) == "excluded":
                continue
            CloudWatch._format_query(
                query, "%s/%s" % ("CPUCreditBalance", instance_id), {
                    "MetricName": "CPUCreditBalance",
                    "Namespace": "AWS/EC2",
                    "Dimensions": [{
                        "Name": "InstanceId",
                        "Value": instance_id
                    }],
                    "Period": 300,
                    "Statistic": "Average"
                })
        self.ec2.set_state(
            "cloudwatch.metrics.last_burstable_metric_collect_date", now,
            TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))

    # Make request to CloudWatch: batches of up to 500 queries (GetMetricData limit).
    query_counter = self.ec2.get_state_int("cloudwatch.metric.query_counter", default=0)
    queries = query["Queries"]
    metric_results = []
    metric_ids = []
    no_metric_ids = []
    while len(queries) > 0:
        q = queries[:500]
        queries = queries[500:]
        results = []
        response = None
        while response is None or "NextToken" in response:
            args = {
                "MetricDataQueries": q,
                "StartTime": now - timedelta(
                    seconds=Cfg.get_duration_secs("cloudwatch.metrics.data_period")),
                "EndTime": now
            }
            if response is not None:
                args["NextToken"] = response["NextToken"]
            response = client.get_metric_data(**args)
            results.extend(response["MetricDataResults"])
            # Each GetMetricData call is billed per metric requested.
            query_counter += len(q)
        for r in results:
            if r["StatusCode"] != "Complete":
                log.error("Failed to retrieve metrics: %s" % q)
                continue
            metric_id = query["IdMapping"][r["Id"]]
            if len(r["Timestamps"]) == 0:
                if metric_id not in no_metric_ids:
                    no_metric_ids.append(metric_id)
                continue
            if metric_id not in metric_ids:
                metric_ids.append(metric_id)
            r["_MetricId"] = metric_id
            r["_SamplingTime"] = str(now)
            log.debug(r)
            metric_results.append(r)
    if len(no_metric_ids):
        log.info("No metrics returned for alarm '%s'" % no_metric_ids)

    # Merge with existing cache metric: keep cached entries that were not refreshed
    # this run and are still within the retention period.
    metric_cache = self.metric_cache
    self.metric_cache = metric_results
    for m in metric_cache:
        max_retention_period = Cfg.get_duration_secs(
            "cloudwatch.metrics.cache.max_retention_period")
        if m["_MetricId"] in metric_ids or "_SamplingTime" not in m:
            continue
        if (now - misc.str2utc(m["_SamplingTime"])).total_seconds() < max_retention_period:
            self.metric_cache.append(m)
    self.ec2.set_state("cloudwatch.metric.query_counter", query_counter,
                       TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
    self.ec2.set_state_json("cloudwatch.metrics.cache", self.metric_cache,
                            TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
    self.set_metric("Cloudwatch.GetMetricData", query_counter)

    # Augment Alarm definitions and Instances with associated metrics
    for metric in self.metric_cache:
        metric_id = metric["_MetricId"]
        alarm_data = self.get_alarm_data_by_name(metric_id)
        if alarm_data is not None:
            alarm_data["MetricDetails"] = metric
            continue
        instance = next(
            filter(lambda i: "CPUCreditBalance/%s" % i["InstanceId"] == metric_id,
                   burstable_instances), None)
        if instance is not None:
            instance["_Metrics"] = {}
            instance["_Metrics"]["CPUCreditBalance"] = metric
            continue
def get_date(key, default=None):
    """Look up `key` and parse its value as a UTC datetime.

    Returns `default` when the key is absent (or when parsing falls back).
    """
    raw = get(key)
    return default if raw is None else misc.str2utc(raw, default=default)
def get_prerequisites(self):
    """Gather instance status by calling SSM APIs.

    Reads the maintenance-window configuration keys, fetches the matching SSM
    Maintenance Windows and their tags, refreshes previously launched command
    statuses and updates EC2 SSM initialization state. No-op when the
    'ssm.enable' configuration key is unset/0.
    """
    if not Cfg.get_int("ssm.enable"):
        log.log(log.NOTICE, "SSM support is currently disabled. Set ssm.enable to 1 to enabled it.")
        return
    now = self.context["now"]
    # TTL used for SSM state keys written by this module.
    self.ttl = Cfg.get_duration_secs("ssm.state.default_ttl")
    # NOTE(review): GroupName is not referenced later in this method.
    GroupName = self.context["GroupName"]
    misc.initialize_clients(["ssm"], self.context)
    client = self.context["ssm.client"]

    # Retrive all SSM maintenace windows applicable to this CloneSquad deployment
    # Window name lists are gathered per scope: global defaults, deployment defaults,
    # main fleet, all subfleets, then one entry per named subfleet.
    mw_names = {
        "__globaldefault__": {},
        "__default__": {},
        "__main__": {},
        "__all__": {}
    }
    fmt = self.context.copy()
    mw_names["__globaldefault__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.global_defaults", fmt=fmt)
    mw_names["__default__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.defaults", fmt=fmt)
    mw_names["__main__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.mainfleet.defaults", fmt=fmt)
    mw_names["__all__"]["Names"] = Cfg.get_list("ssm.feature.maintenance_window.subfleet.__all__.defaults", fmt=fmt)
    # Build a deduplicated, order-preserving union of all configured window names.
    all_mw_names = mw_names["__globaldefault__"]["Names"]
    all_mw_names.extend([n for n in mw_names["__default__"]["Names"] if n not in all_mw_names])
    all_mw_names.extend([n for n in mw_names["__main__"]["Names"] if n not in all_mw_names])
    all_mw_names.extend([n for n in mw_names["__all__"]["Names"] if n not in all_mw_names])
    # Register scope-specific keys whose defaults come from the generic
    # '{SubfleetName}' template keys (the f-string prefixes here are inert:
    # the '__all__'/'__main__' keys contain no placeholders).
    Cfg.register({
        f"ssm.feature.maintenance_window.subfleet.__all__.force_running":
            Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running"),
        f"ssm.feature.events.ec2.scaling_state_changes.draining.__main__.connection_refused_tcp_ports":
            Cfg.get("ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports")
    })
    for SubfleetName in self.o_ec2.get_subfleet_names():
        fmt["SubfleetName"] = SubfleetName
        mw_names[f"Subfleet.{SubfleetName}"] = {}
        # Per-subfleet keys default to the generic template values.
        Cfg.register({
            f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults":
                Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults"),
            f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.ec2.schedule.min_instance_count":
                Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.ec2.schedule.min_instance_count"),
            f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running":
                Cfg.get("ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running"),
            f"ssm.feature.events.ec2.scaling_state_changes.draining.{SubfleetName}.connection_refused_tcp_ports":
                Cfg.get("ssm.feature.events.ec2.scaling_state_changes.draining.connection_refused_tcp_ports")
        })
        mw_names[f"Subfleet.{SubfleetName}"]["Names"] = Cfg.get_list(f"ssm.feature.maintenance_window.subfleet.{SubfleetName}.defaults", fmt=fmt)
        all_mw_names.extend([n for n in mw_names[f"Subfleet.{SubfleetName}"]["Names"] if n not in all_mw_names])

    # Fetch window identities in batches of 20 names per filter
    # (API filter-value limit — TODO confirm against boto3 docs).
    names = all_mw_names
    mws = []
    while len(names):
        paginator = client.get_paginator('describe_maintenance_windows')
        response_iterator = paginator.paginate(
            Filters=[
                {
                    'Key': 'Name',
                    'Values': names[:20]
                },
            ])
        for r in response_iterator:
            for wi in r["WindowIdentities"]:
                if not wi["Enabled"]:
                    # Disabled windows are silently dropped from further processing.
                    log.log(log.NOTICE, f"SSM Maintenance Window '%s' not enabled. Ignored..." % wi["Name"])
                    continue
                if "NextExecutionTime" not in wi:
                    log.log(log.NOTICE, f"/!\ SSM Maintenance Window '%s' without 'NextExecutionTime'." % wi["Name"])
                if wi not in mws:
                    mws.append(wi)
        names = names[20:]
    # Make string dates as object dates
    for d in mws:
        if "NextExecutionTime" in d:
            d["NextExecutionTime"] = misc.str2utc(d["NextExecutionTime"])

    # Retrieve Maintenace Window tags with the resourcegroup API
    tagged_mws = self.context["o_state"].get_resources(service="ssm", resource_name="maintenancewindow")
    for tmw in tagged_mws:
        # ARN format assumed to be '...:maintenancewindow/<WindowId>' — the id is after '/'.
        mw_id = tmw["ResourceARN"].split("/")[1]
        mw = next(filter(lambda w: w["WindowId"] == mw_id, mws), None)
        if mw:
            mw["Tags"] = tmw["Tags"]
    valid_mws = []
    for mw in mws:
        mw_id = mw["WindowId"]
        if "Tags" not in mw:
            # Fallback: fetch tags directly when the resourcegroup API did not return them.
            try:
                response = client.list_tags_for_resource(ResourceType='MaintenanceWindow', ResourceId=mw_id)
                mw["Tags"] = response['TagList'] if 'TagList' in response else []
            except Exception as e:
                # Best-effort: tag fetch failure is logged, window handling continues below.
                log.error(f"Failed to fetch Tags for MaintenanceWindow '{mw_id}'")
        # Non-global-default windows must carry tags (expected: 'clonesquad:group-name');
        # untagged ones are rejected.
        if ("Tags" not in mw or not len(mw["Tags"])) and mw["Name"] not in mw_names["__globaldefault__"]["Names"]:
            log.warning(f"Please tag SSM Maintenance Window '%s/%s' with 'clonesquad:group-name': '%s'!" % (mw["Name"], mw["WindowId"], self.context["GroupName"]))
            continue
        valid_mws.append(mw)

    self.maintenance_windows = {
        "Names": mw_names,
        "Windows": valid_mws
    }

    # Update asynchronous results from previously launched commands
    self.update_pending_command_statuses()

    # Perform maintenance window house keeping
    self.manage_maintenance_windows()

    if len(mws):
        log.log(log.NOTICE, f"Found matching SSM maintenance windows: %s" % self.maintenance_windows["Windows"])

    # Hard dependency toward EC2 module. We update the SSM instance initializing states
    self.o_ec2.update_ssm_initializing_states()
def seconds_since_last_call():
    """Return seconds elapsed since the recorded 'main.last_call_date' (0 when never set)."""
    if "main.last_call_date" not in ctx:
        return 0
    # Fall back to the epoch when the stored value cannot be parsed.
    last_call = misc.str2utc(ctx["main.last_call_date"], default=misc.epoch())
    elapsed = misc.utc_now() - last_call
    return elapsed.total_seconds()