def set_kv(self, key, value, partition=None, TTL=None):
    now = self.context["now"]
    ttl = 0 if TTL is None else int(TTL)
    k   = key if partition is None else "[%s]%s" % (partition, key)
    # Optimize writes to the KV table to reduce cost: only write to the KV
    # table when the value differs from the cached one or when the record is
    # past half of its expiration time.
    if self.table_cache is not None:
        item = self.get_item(key, partition=partition)
        if (item is not None and "ExpirationTime" in item and item["Value"] == str(value)
                and (misc.seconds_from_epoch_utc() + (ttl / 2)) < int(item["ExpirationTime"])):
            log.debug("KVTable: Optimized write to '%s' with value '%s'" % (k, value))
            return
    if not self.is_aggregated_key(k):
        KVTable.set_kv_direct(k, value, self.table_name, TTL=ttl, context=self.context)

    # Update the cache
    if self.table_cache is None:
        # KV table not yet initialized
        return
    expiration_time = misc.seconds_from_epoch_utc(now=now) + ttl
    new_item = {
        "Key": k,
        "Value": str(value),
        "ExpirationTime": int(expiration_time)
    }
    if item is not None:
        if str(value) == "":
            self.table_cache.remove(item)
        else:
            item.update(new_item)
    else:
        self.table_cache.append(new_item)
    # Rebuild the dict representation
    self._build_dict()
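# Usage sketch (hypothetical, assuming an initialized KVTable instance named
# 'table'; the config keys below are illustrative): the second identical write
# is skipped because the cached value is unchanged and the record is not yet
# past half of its TTL.
#
#   table.set_kv("ec2.schedule.min_instance_count", "2", TTL=3600)
#   table.set_kv("ec2.schedule.min_instance_count", "2", TTL=3600)  # optimized away
#   table.set_kv("active", "1", partition="fleet1", TTL=600)        # stored under key "[fleet1]active"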
def set_kv_direct(key, value, table_name, partition=None, TTL=None, context=None):
    if context is None:
        client = boto3.client("dynamodb")
        now    = None
    else:
        client = context["dynamodb.client"]
        now    = context["now"]
    log.debug("KVTable: dynamodb.put_item(TableName=%s, Key=%s, Value='%s')" % (table_name, key, value))
    existing_object = KVTable._get_cache_for_tablename(table_name)
    if existing_object is not None:
        existing_object.table_cache_dirty = True
    if value is None or str(value) == "":
        client.delete_item(Key={"Key": {"S": key}}, TableName=table_name)
    else:
        query = {
            'Key': {'S': key},
            'Value': {'S': str(value)}
        }
        if TTL is not None and TTL != 0:
            expiration_time = misc.seconds_from_epoch_utc(now=now) + TTL
            query.update({'ExpirationTime': {'N': str(expiration_time)}})
        log.log(log.NOTICE, f"DynamoDB({table_name}): Writing key '{key}' (TTL={TTL}, size={len(str(value))})")
        response = client.put_item(TableName=table_name, ReturnConsumedCapacity='TOTAL', Item=query)
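# Usage sketch (illustrative; the table name is hypothetical): when no context
# is supplied, set_kv_direct() builds its own boto3 client and computes the
# expiration from the current wall clock; an empty or None value deletes the item.
#
#   KVTable.set_kv_direct("demo.key", "demo-value", "CloneSquad-Demo-KV", TTL=300)
#   KVTable.set_kv_direct("demo.key", "", "CloneSquad-Demo-KV")   # deletes the item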
def update_pending_command_statuses(self):
    client = self.context["ssm.client"]
    self.run_cmd_states = self.o_state.get_state_json("ssm.events.run_commands", default={
        "Commands": [],
        "FormerResults": {}
    })
    former_results = self.run_cmd_states["FormerResults"]
    cmds           = self.run_cmd_states["Commands"]
    for cmd in cmds:
        command = cmd["Command"]
        args    = cmd["CommandArgs"]
        if "Complete" not in cmd:
            cmd_id            = cmd["Id"]
            paginator         = client.get_paginator('list_command_invocations')
            response_iterator = paginator.paginate(CommandId=cmd_id, Details=True, MaxResults=50)
            for response in response_iterator:
                for invoc in response["CommandInvocations"]:
                    instance_id = invoc["InstanceId"]
                    status      = invoc["Status"]
                    if status not in ["Success", "Cancelled", "Failed", "TimedOut", "Undeliverable",
                                      "Terminated", "Delivery Timed Out", "Execution Timed Out"]:
                        continue
                    stdout  = [s.rstrip() for s in io.StringIO(invoc["CommandPlugins"][0]["Output"]).readlines()
                               if s.startswith("CLONESQUAD-SSM-AGENT-")]
                    bie_msg = next(filter(lambda s: s.startswith("CLONESQUAD-SSM-AGENT-BIE:"), stdout), None)
                    if not bie_msg:
                        log.log(log.NOTICE, f"Truncated reply from SSM Command Invocation ({cmd_id}/{instance_id}). "
                                "*Cause: SSM exec error? Shell command output too verbose? (please limit to 24kBytes max!)")
                    agent_status = "CLONESQUAD-SSM-AGENT-STATUS:"
                    status_msg   = next(filter(lambda s: s.startswith(agent_status), stdout), None)
                    if status_msg is None:
                        status_msg = "ERROR"
                    else:
                        status_msg = status_msg[len(agent_status):]
                    details_msg = list(filter(lambda s: s.startswith("CLONESQUAD-SSM-AGENT-DETAILS:"), stdout))
                    warning_msg = list(filter(lambda s: ":WARNING:" in s, stdout))
                    if len(warning_msg):
                        log.warning(f"Got warning while retrieving SSM RunCommand output for {cmd_id}/{instance_id}/{command}: "
                                    f"{warning_msg}/{details_msg}")
                    result = {
                        "SSMInvocationStatus": status,
                        "Status": status_msg,
                        "Truncated": bie_msg is None,
                        "Expiration": misc.seconds_from_epoch_utc() + Cfg.get_duration_secs("ssm.state.command.result.default_ttl")
                    }
                    # Keep track of the former results, indexed per instance
                    if instance_id not in former_results:
                        former_results[instance_id] = {}
                    former_results[instance_id][f"{command};{args}"] = result
                    if instance_id not in cmd["ReceivedInstanceIds"]:
                        cmd["ReceivedInstanceIds"].append(instance_id)
                    if set(cmd["ReceivedInstanceIds"]) & set(cmd["InstanceIds"]) == set(cmd["InstanceIds"]):
                        # All invocation results received
                        cmd["Complete"] = True
    self.commands_to_send = []
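# Illustration (marker format inferred from the parsing above; the sample
# values are hypothetical): the helper script wrapped around each command is
# expected to emit marker lines on stdout such as
#
#   CLONESQUAD-SSM-AGENT-STATUS:SUCCESS
#   CLONESQUAD-SSM-AGENT-DETAILS:Instance ready
#   CLONESQUAD-SSM-AGENT-BIE:Bye
#
# Only lines prefixed with "CLONESQUAD-SSM-AGENT-" are parsed; a missing
# BIE marker is interpreted as a truncated reply.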
def get_item(self, key, partition=None):
    now = self.context["now"]
    k   = key if partition is None else "[%s]%s" % (partition, key)
    for item in self.table_cache:
        if "Key" in item and item["Key"] == k:
            if "ExpirationTime" not in item or int(item["ExpirationTime"]) > misc.seconds_from_epoch_utc(now=now):
                return item
            # Expired record. Garbage collect it now...
            self.table_cache.remove(item)
            self._build_dict()
            return None
    return None
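# Usage sketch (illustrative key names): keys written with a partition are
# stored with a "[partition]" prefix, so the lookup below resolves the raw
# cache key "[fleet1]desired_count"; expired items are garbage collected on read.
#
#   item = table.get_item("desired_count", partition="fleet1")
#   value = item["Value"] if item is not None else None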
def _safe_key_import(self, dest_list, aggregate, records, exclude_aggregate_key=True):
    now           = misc.seconds_from_epoch_utc(now=self.context["now"])
    existing_keys = [d["Key"] for d in dest_list]
    if aggregate is None:
        for r in records:
            key = r["Key"]
            if key in existing_keys:
                log.error("Duplicate KV key '%s'!" % key)
                continue
            if "ExpirationTime" in r and int(r["ExpirationTime"]) < now:
                continue
            existing_keys.append(key)
            dest_list.append(r)
        return

    excludes = aggregate["Exclude"] if "Exclude" in aggregate else []
    prefix   = aggregate["Prefix"]
    for r in records:
        if "ExpirationTime" in r and int(r["ExpirationTime"]) < now:
            continue
        key = r["Key"]
        if (not key.startswith(prefix) or (exclude_aggregate_key and key == prefix)
                or (exclude_aggregate_key and key.endswith("."))):
            continue
        if key in existing_keys:
            log.error("Duplicate KV key '%s'!" % key)
            continue
        excluded = next(filter(lambda e: key.startswith(e) and key != e, excludes), None)
        if excluded is not None:
            continue
        existing_keys.append(key)
        dest_list.append(r)
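# Illustration (hypothetical aggregate descriptor, shaped after the fields read
# here and in persist_aggregates() below): _safe_key_import() copies only
# non-expired records whose key starts with "Prefix", skipping the aggregate
# key itself and any excluded sub-prefix.
#
#   aggregate = {
#       "Prefix":     "ssm.events.",                  # assumed prefix
#       "Exclude":    ["ssm.events.run_commands."],   # assumed exclusion
#       "Compress":   True,
#       "DefaultTTL": 300
#   }
#   imported = []
#   self._safe_key_import(imported, aggregate, self.table_cache)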
def persist_aggregates():
    global all_kv_objects
    seconds_from_epoch = misc.seconds_from_epoch_utc()
    for t in all_kv_objects:
        if t.table_cache is None or not t.is_kv_table or len(t.aggregates) == 0:
            continue
        log.debug("Persisting aggregates for KV table '%s'..." % t.table_name)
        xray_recorder.begin_subsegment("persist_aggregates.%s" % t.table_name)
        for aggregate in t.aggregates:
            serialized = []
            ttl        = 0
            compress   = aggregate["Compress"] if "Compress" in aggregate else False
            prefix     = aggregate["Prefix"]
            t._safe_key_import(serialized, aggregate, t.table_cache)
            for i in serialized:
                ttl = max(ttl, i["ExpirationTime"] - seconds_from_epoch) if "ExpirationTime" in i else ttl
            if len(serialized) == 0:
                ttl = aggregate["DefaultTTL"]
            if log.level == log.DEBUG:
                log.log(log.NOTICE, "Delta between aggregate '%s' in DynamoDB and the new one:" % prefix)
                KVTable.compare_kv_list(serialized, misc.decode_json(t.get_kv(prefix)))
                log.log(log.NOTICE, "Delta end.")
            t.set_kv(prefix, misc.encode_json(serialized, compress=compress), TTL=ttl)
        if t.table_cache_dirty:
            t.set_kv("cache.last_write_index", t.context["now"], TTL=0)
        xray_recorder.end_subsegment()
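# Usage note (inferred from the call sites in _record_call() below):
# persist_aggregates() is a module-level flush over all registered KVTable
# objects, typically called at the end of a Lambda invocation or just before
# re-raising an exception, so in-memory aggregate records are written back to
# DynamoDB in a single pass.
#
#   try:
#       do_scaling_work()                 # hypothetical workload
#   finally:
#       KVTable.persist_aggregates()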
def _record_call(need_shortterm_record, is_success_func, f, *args, **kwargs):
    global records
    global notify_mgr
    record              = {}
    record["EventType"] = f.__name__
    record["Input"]     = {"*args": list(args), "**kwargs": dict(kwargs)}
    managed_exception   = None

    xray_recorder.begin_subsegment("notifycall-call:%s" % f.__name__)
    try:
        r = f(*args, **kwargs)
        record["Output"] = json.dumps(r, default=str)
    except Exception as e:
        managed_exception = e
        record["Except"] = {
            "Exception": traceback.format_exc(),
            "Stacktrace": traceback.extract_stack(),
            "Reason": json.dumps(e, default=str)
        }
    xray_recorder.end_subsegment()

    if managed_exception is not None:
        # Persist all aggregated data now to avoid losing them
        xray_recorder.begin_subsegment("notifycall-persist_aggregates:%s" % f.__name__)
        try:
            KVTable.persist_aggregates()
        except Exception as e:
            log.exception("Failed to persist aggregated data!")
        xray_recorder.end_subsegment()

    if notify_mgr is None or do_not_notify:
        log.debug("Do not write Event in event table: notify_mgr=%s, do_not_notify=%s" % (notify_mgr, do_not_notify))
        if managed_exception is not None:
            raise managed_exception
        return r

    ctx = notify_mgr.context
    try:
        need_longterm_record = ((managed_exception is not None or not is_success_func(args, kwargs, r))
                                if is_success_func is not None else False)
    except Exception as e:
        log.exception("Got an exception while assessing long term event management: %s" % e)
        need_longterm_record = True

    # Try to capture as much metadata as possible to ease later diagnosis.
    # Protect against exceptions to ensure proper logging.
    record["Metadata"] = {}
    xray_recorder.begin_subsegment("notifycall-build_metadata:%s" % f.__name__)
    try:
        notify_mgr.ec2.get_prerequisites(only_if_not_already_done=True)
        record["Metadata"]["EC2"] = {
            "AllInstanceDetails": notify_mgr.ec2.get_instances(),
            "AllInstanceStatuses": notify_mgr.ec2.get_instance_statuses(),
            "DrainingInstances": [i["InstanceId"] for i in notify_mgr.ec2.get_instances(ScalingState="draining")],
            "BouncedInstances": [i["InstanceId"] for i in notify_mgr.ec2.get_instances(ScalingState="bounced")],
            "ExcludedInstances": [i["InstanceId"] for i in notify_mgr.ec2.get_instances(ScalingState="excluded")],
            "ErrorInstances": [i["InstanceId"] for i in notify_mgr.ec2.get_instances(ScalingState="error")],
            "ScalingStates": notify_mgr.ec2.get_all_scaling_states()
        }
    except Exception as e:
        log.exception('Failed to create record["Metadata"]["EC2"]: %s' % e)
    xray_recorder.end_subsegment()

    xray_recorder.begin_subsegment("notifycall-build_metadata_targetgroup:%s" % f.__name__)
    try:
        notify_mgr.targetgroup.get_prerequisites(only_if_not_already_done=True)
        record["Metadata"]["TargetGroups"] = notify_mgr.targetgroup.get_targetgroups_info()
    except Exception as e:
        log.exception('Failed to create record["Metadata"]["TargetGroups"]: %s' % e)
    xray_recorder.end_subsegment()

    for key in ["Metadata"]:
        zipped_bytes = gzip.compress(bytes(json.dumps(record[key], default=str), "utf-8"))
        record[key]  = str(base64.b64encode(zipped_bytes), "utf-8")

    now         = misc.utc_now()
    now_seconds = misc.seconds_from_epoch_utc()

    max_longterm_records = Cfg.get_int("notify.event.longterm.max_records")
    if max_longterm_records <= 0:
        need_longterm_record = False

    tables = [
        {
            "Name": ctx["EventTable"],
            "NeedWrite": need_shortterm_record,
            "TTL": Cfg.get_duration_secs("notify.event.default_ttl"),
            "DBImages": False,
            "DebugReport": False
        },
        {
            "Name": ctx["LongTermEventTable"],
            "NeedWrite": need_longterm_record,
            "TTL": Cfg.get_duration_secs("notify.event.longterm.ttl"),
            "DBImages": True,
            "DebugReport": True
        },
    ]

    xray_recorder.begin_subsegment("notifycall-update_tables:%s" % f.__name__)
    for table in tables:
        if not table["NeedWrite"]:
            continue
        UpdateExpression = ("set EventSource=:entrypoint, EventType=:eventtype, InputData=:input, "
                            "OutputData=:output, HandledException=:exception, "
                            "Metadata=:metadata, ExpirationTime=:expirationtime")
        ExpressionAttributeValues = {
            ':entrypoint': {'S': ctx["FunctionName"]},
            ':eventtype': {'S': record["EventType"]},
            ':input': {'S': json.dumps(record["Input"], default=str)},
            ':output': {'S': json.dumps(record["Output"] if "Output" in record else {}, default=str)},
            ':exception': {'S': json.dumps(record["Except"] if "Except" in record else "", default=str)},
            ':metadata': {'S': json.dumps(record["Metadata"], default=str)},
            ':expirationtime': {'N': str(now_seconds + table["TTL"])}
        }
        if table["DBImages"]:
            # Insert snapshots of the CloudWatch dashboard
            try:
                log.log(log.NOTICE, "Generating snapshots for Dashboard graphs...")
                images = notify_mgr.cloudwatch.get_dashboard_images()
                for i in images:
                    compressed_name = i.replace(" ", "")
                    UpdateExpression += ", Graph_%s_PNG=:graph%s" % (compressed_name, compressed_name)
                    ExpressionAttributeValues[":graph%s" % compressed_name] = {'S': images[i]}
                log.info("/!\\ Generated CloudWatch dashboard PNG snapshots in DynamoDB table '%s' "
                         "for further event analysis!" % table["Name"])
            except Exception as e:
                log.exception("Failed to retrieve CloudWatch snapshot images! : %s" % e)
        response = ctx["dynamodb.client"].update_item(
            Key={"EventDate": {'S': str(now)}},
            UpdateExpression=UpdateExpression,
            ExpressionAttributeValues=ExpressionAttributeValues,
            ReturnConsumedCapacity='TOTAL',
            TableName=table["Name"],
        )
        log.debug(Dbg.pprint(response))
        log.log(log.NOTICE, "Written event '[%s] %s' to table '%s'."
                % (str(now), record["EventType"], table["Name"]))

        # Keep the number of LongTerm items stored in the DynamoDB table under control
        if need_longterm_record:
            longterm_item_eventdates = [m["_"] for m in notify_mgr.state.get_metastring_list("notify.longterm.itemlist", default=[])]
            log.log(log.NOTICE, "Guessed number of records in LongTerm Event table: %d", len(longterm_item_eventdates))
            longterm_item_eventdates.append(str(now))
            nb_records_to_delete = max(len(longterm_item_eventdates) - max_longterm_records, 0)
            for eventdate in longterm_item_eventdates[:nb_records_to_delete]:
                try:
                    response = ctx["dynamodb.client"].delete_item(
                        Key={'EventDate': {'S': eventdate}},
                        TableName=ctx["LongTermEventTable"])
                    log.debug(response)
                    log.log(log.NOTICE, "Purged LongTerm Event record '%s' as too many are already stored "
                            "(notify.event.longterm.max_records=%d)" % (eventdate, max_longterm_records))
                except Exception as e:
                    log.exception("Got exception while deleting LongTerm record '%s' : %s" % (eventdate, e))
            notify_mgr.state.set_state("notify.longterm.itemlist",
                                       ";".join(longterm_item_eventdates[nb_records_to_delete:]),
                                       TTL=Cfg.get_duration_secs("notify.event.longterm.ttl"))

        try:
            KVTable.persist_aggregates()
        except Exception as e:
            log.exception("Got exception while persisting KVTables : %s" % e)

        # Manage Debug report export to S3
        url = ctx["LoggingS3Path"]
        if url != "" and table["DebugReport"] and Cfg.get_int("notify.debug.send_s3_reports"):
            xray_recorder.begin_subsegment("notifycall-publish_all_reports:%s" % f.__name__)
            if ctx["FunctionName"] == "Interact":
                # Avoid recursion when throwing from the Interact function
                log.info("Publishing Debug reports synchronously...")
                debug.publish_all_reports(ctx, url, "notifymgr_report")
            else:
                client = ctx["sqs.client"]
                log.info("Notifying Interact SQS Queue '%s' for asynchronous debug report generation..." % ctx["InteractSQSUrl"])
                response = client.send_message(
                    QueueUrl=ctx["InteractSQSUrl"],
                    MessageBody=json.dumps({
                        "OpType": "Debug/PublishReportNow",
                        "Events": {
                            "Timestamp": str(ctx["now"])
                        }
                    }))
                log.debug(response)
            xray_recorder.end_subsegment()
    xray_recorder.end_subsegment()

    if managed_exception is not None:
        raise managed_exception
    return r
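# Usage sketch (hypothetical wrapper, shaped after the signature above):
# _record_call() is meant to wrap an entrypoint so that its input, output,
# exception and environment metadata get captured in the Event tables.
#
#   def is_success(args, kwargs, r):
#       return r is not None                       # assumed success predicate
#
#   def record_call(f, *args, **kwargs):
#       return _record_call(True, is_success, f, *args, **kwargs)
#
#   result = record_call(main_handler, event, context)  # hypothetical entrypoint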
def send_commands(self):
    if not Cfg.get_int("ssm.enable"):
        return
    client = self.context["ssm.client"]
    refs = {
        "Linux": {
            "document": "AWS-RunShellScript",
            "shell": [s.rstrip() for s in io.StringIO(str(misc.get_url("internal:cs-ssm-agent.sh"), "utf-8")).readlines()],
            "ids": [],
        }
    }
    # Purge already replied results
    valid_cmds = []
    for cmd in self.run_cmd_states["Commands"]:
        if cmd.get("Complete") or cmd["Expiration"] < misc.seconds_from_epoch_utc():
            continue
        valid_cmds.append(cmd)
    self.run_cmd_states["Commands"] = valid_cmds

    # Purge outdated former results
    former_results = self.run_cmd_states["FormerResults"]
    for i in list(former_results.keys()):
        for cmd in list(former_results[i].keys()):
            if former_results[i][cmd]["Expiration"] < misc.seconds_from_epoch_utc():
                del former_results[i][cmd]
        if len(former_results[i].keys()) == 0:
            del former_results[i]

    # Send commands
    for cmd in self.commands_to_send:
        platforms = {}
        for i in cmd["InstanceIds"]:
            info = self.is_instance_online(i)
            if info is None:
                continue
            platform_type = info["PlatformType"]
            pltf          = refs.get(platform_type)
            if pltf is None:
                # Unsupported platform
                log.warning("Can't run a command on an unsupported platform: %s" % info["PlatformType"])
                continue
            if platform_type not in platforms:
                platforms[platform_type] = copy.deepcopy(pltf)
            if i not in platforms[platform_type]["ids"]:
                platforms[platform_type]["ids"].append(i)

        command = cmd["Command"]
        args    = cmd["CommandArgs"]
        for p in platforms:
            pltf         = platforms[p]
            instance_ids = pltf["ids"]
            if not len(instance_ids):
                continue
            document = pltf["document"]
            shell    = pltf["shell"]
            i_ids    = instance_ids
            # Perform string parameter substitutions in the helper script
            shell_input = [l.replace("##Cmd##", command) for l in shell]
            shell_input = [l.replace("##ApiGwUrl##", self.context["InteractAPIGWUrl"]) for l in shell_input]
            if isinstance(args, str):
                shell_input = [l.replace("##Args##", args) for l in shell_input]
            else:
                shell_input = [l.replace("##Args##", args["Args"] if "Args" in args else "") for l in shell_input]
                for s in args:
                    shell_input = [l.replace(f"##{s}##", str(args[s])) for l in shell_input]
            # SSM SendCommand accepts at most 50 explicit instance ids per call
            while len(i_ids):
                log.log(log.NOTICE, f"SSM SendCommand({p}): {command}({args}) to %s." % i_ids[:50])
                try:
                    response = client.send_command(
                        InstanceIds=i_ids[:50],
                        DocumentName=document,
                        TimeoutSeconds=cmd["Timeout"],
                        Comment=cmd["Comment"],
                        Parameters={
                            'commands': shell_input,
                            'executionTimeout': [str(cmd["Timeout"])]
                        },
                        MaxConcurrency='100%',
                        MaxErrors='100%',
                        CloudWatchOutputConfig={
                            'CloudWatchLogGroupName': self.context["SSMLogGroup"],
                            'CloudWatchOutputEnabled': True
                        }
                    )
                    self.run_cmd_states["Commands"].append({
                        "Id": response["Command"]["CommandId"],
                        "InstanceIds": i_ids[:50],
                        "ReceivedInstanceIds": [],
                        "Command": command,
                        "CommandArgs": args,
                        "Results": {},
                        "Expiration": misc.seconds_from_epoch_utc() + Cfg.get_duration_secs("ssm.state.command.default_ttl")
                    })
                    log.log(log.NOTICE, f"SSM RunCommand (Id:%s) : {command}({args})" % response["Command"]["CommandId"])
                except Exception as e:
                    # Under rare circumstances, an exception can be thrown while sending
                    log.log(log.NOTICE, f"Failed to do SSM SendCommand : {e}, %s" % i_ids[:50])
                i_ids = i_ids[50:]
    self.o_state.set_state_json("ssm.events.run_commands", self.run_cmd_states, compress=True, TTL=self.ttl)
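# Usage sketch (hypothetical descriptor, shaped after the fields consumed
# above; the command name and instance id are illustrative): callers append
# command descriptors to self.commands_to_send before send_commands() runs.
#
#   self.commands_to_send.append({
#       "InstanceIds": ["i-0123456789abcdef0"],
#       "Command":     "INSTANCE_HEALTHCHECK",      # assumed command name
#       "CommandArgs": {"Args": ""},
#       "Timeout":     600,                         # seconds, assumed value
#       "Comment":     "CloneSquad launched command"
#   })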
help="LoadBalancer URL", type=str, nargs=1) parser.add_argument('--period', help="Duration", type=str, default="hours=2") parser.add_argument('--max-concurrency', help="Connection concurrency to load balancer", type=int, default=30) args = parser.parse_args() args_dict = {} for a in args._get_kwargs(): args_dict[a[0]] = a[1] period = misc.str2duration_seconds(args.period) time_offset = misc.seconds_from_epoch_utc() max_concurrency = args.max_concurrency while True: now = misc.seconds_from_epoch_utc() seconds = now - time_offset concurrency = 1 + int( (max_concurrency - 1) * ((1 - math.cos(2 * math.pi * (seconds % period) / period)) / 2.0)) cmd = "ab -c %(concurrency)s -n %(concurrency)s %(loadbalancer_url)s" % { "concurrency": concurrency, "loadbalancer_url": args.loadbalancer_url[0] } print(cmd) os.system(cmd)
def handler(self, event, context):
    # Protect from bad data and keep only SNS messages
    if "Records" not in event:
        log.error("Not a valid SNS event")
        return
    sns_records = []
    for sns_msg in event["Records"]:
        if "EventSource" in sns_msg and sns_msg["EventSource"] == "aws:sns":
            try:
                sns_msg["_decoded_message"] = json.loads(sns_msg["Sns"]["Message"])
                sns_records.append(sns_msg)
            except Exception as e:
                log.exception("Failed to decode message %s : %s" % (sns_msg, e))
    log.debug(Dbg.pprint(sns_records))

    need_main_update = False
    # For each SNS record, keep track of important data in a DynamoDB table
    for sns_msg in sns_records:
        message          = sns_msg["_decoded_message"]
        timestamp        = datetime.fromisoformat(message["StateChangeTime"].replace("+0000", "")).replace(tzinfo=timezone.utc)
        alarm_name       = message["AlarmName"]
        new_state_reason = message["NewStateReason"]
        new_state_value  = message["NewStateValue"]
        namespace        = message["Trigger"]["Namespace"]
        metric_name      = message["Trigger"]["MetricName"]
        instance_id      = "None"
        try:
            instance_id = next(filter(lambda dimension: dimension['name'] == 'InstanceId',
                                      message["Trigger"]["Dimensions"]))["value"]
        except Exception as e:
            log.exception("Failed to get InstanceId from dimension %s : %s" % (message["Trigger"]["Dimensions"], e))
            continue

        now = misc.seconds_from_epoch_utc()
        response = self.context["dynamodb.client"].update_item(
            Key={"AlarmName": {'S': alarm_name}},
            UpdateExpression=
                "set InstanceId=:instanceid, %s_LastAlarmTimeStamp=:timestamp, %s_LastNewStateReason=:lastnewstatereason, "
                "%s_LastMetricName=:lastmetricname, %s_LastMetricNamespace=:lastmetricnamespace, "
                "%s_Event=:event, "
                "ExpirationTime=:expirationtime, "
                "LastRecordUpdateTime=:lastrecordupdatetime"
                % (new_state_value, new_state_value, new_state_value, new_state_value, new_state_value),
            ExpressionAttributeValues={
                ':instanceid': {'S': instance_id},
                ':timestamp': {'S': str(timestamp)},
                ':lastnewstatereason': {'S': new_state_reason},
                ':lastmetricname': {'S': metric_name},
                ':lastmetricnamespace': {'S': namespace},
                ':event': {'S': json.dumps(message)},
                ':expirationtime': {'N': str(now + Cfg.get_duration_secs("snsmgr.record_expiration_delay"))},
                ':lastrecordupdatetime': {'N': str(now)}
            },
            ReturnConsumedCapacity='TOTAL',
            TableName=self.context["AlarmStateEC2Table"],
        )
        need_main_update = True

    if need_main_update:
        # Send a message to wake up the Main Lambda function, which is in
        # charge of taking the appropriate decision
        sqs.call_me_back_send(self.ec2)
        log.debug("Sent SQS message to Main lambda queue: %s" % self.context["MainSQSQueue"])
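# Illustration (abridged payload with hypothetical values; the fields match
# those parsed above): the handler expects the standard CloudWatch alarm SNS
# message in each record's Sns.Message, e.g.
#
#   {
#     "AlarmName": "CloneSquad-CPUAlarm-i-0123456789abcdef0",
#     "NewStateValue": "ALARM",
#     "NewStateReason": "Threshold Crossed: ...",
#     "StateChangeTime": "2021-01-01T12:00:00.000+0000",
#     "Trigger": {
#       "Namespace": "AWS/EC2",
#       "MetricName": "CPUUtilization",
#       "Dimensions": [{"name": "InstanceId", "value": "i-0123456789abcdef0"}]
#     }
#   }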