def interact_handler_entrypoint(event, context):
    """Lambda entry point for the Interact (API Gateway / SQS) function.

    NOTE(review): another definition of this same entry point appears later in
    this file; at import time the later definition wins. Confirm which variant
    is the intended one.

    Parameters
    ----------
    event: dict, required
    context: object, required
        Lambda Context runtime methods and attributes
        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    -------
    dict
        Response dict populated (possibly left empty) by the Interact handler.
    """
    global ctx
    # Refresh the per-invocation timestamp and advertise which function runs.
    ctx["now"] = misc.utc_now()
    ctx["FunctionName"] = "Interact"
    init()
    # Dump the raw incoming event for debugging purposes.
    print(Dbg.pprint(event))
    # Load the helper objects this handler depends on.
    load_prerequisites(["o_state", "o_ec2", "o_notify", "o_targetgroup", "o_scheduler", "o_interact"])
    response = {}
    if ctx["o_interact"].handler(event, context, response):
        log.log(log.NOTICE, "API Gateway response: %s" % response)
    # Acknowledge any SQS records carried by the event.
    # NOTE(review): the later variant of this function calls
    # sqs.process_sqs_records(ctx, event) — confirm the expected signature.
    sqs.process_sqs_records(event)
    return response
def debug_report_generator(ctx):
    """Build a debug report embedding a full scan of every known DynamoDB table.

    Every context key ending in 'Table' is treated as a DynamoDB table name and
    scanned; tables that fail to scan are logged and skipped.
    """
    report = {"GenerationDate": misc.utc_now()}
    # Collect all DynamoDB table content
    tables = {}
    report["DynamoDBTables"] = tables
    for key in ctx:
        if not key.endswith("Table"):
            continue
        name = ctx[key]
        try:
            content = misc.dynamodb_table_scan(ctx["dynamodb.client"], name)
        except Exception as e:
            log.exception("Failed to retrieve DynamoDB table '%s' : %s" % (name, e))
            continue
        tables[name] = content
    return report
def sns_handler(event, context):
    """Entry point of the SNS Lambda function.

    Parameters
    ----------
    event: dict, required
    context: object, required
        Lambda Context runtime methods and attributes
        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    -------
    Whatever the SNSMgr handler returns.
    """
    global ctx
    ctx["now"] = misc.utc_now()
    log.log(log.NOTICE, "Handler start.")
    ctx["FunctionName"] = "SNS"
    log.info("Processing start (version=%s)" % (ctx.get("CloneSquadVersion")))
    init()
    # NOTE(review): ctx["o_ec2"] is read below but is not in this prerequisites
    # list; presumably init() makes it available — confirm.
    misc.load_prerequisites(ctx, ["o_state", "o_notify", "o_targetgroup"])
    Cfg.dump()
    manager = sns.SNSMgr(ctx, ctx["o_ec2"])
    result = manager.handler(event, context)
    # Persist all aggregated data
    KVTable.persist_aggregates()
    # Call me back if needed
    call_me_back_send()
    log.log(log.NOTICE, "Normal end.")
    return result
def interact_handler_entrypoint(event, context):
    """Entry point of the Interact Lambda function.

    Parameters
    ----------
    event: dict, required
    context: object, required
        Lambda Context runtime methods and attributes
        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    -------
    dict
        Response dict populated (possibly left empty) by the Interact handler.
    """
    global ctx
    ctx["now"] = misc.utc_now()
    ctx["FunctionName"] = "Interact"
    log.info("Processing start (version=%s)" % (ctx.get("CloneSquadVersion")))
    init()
    # We do not want notification and event management in the Interact function
    notify.do_not_notify = True
    #log.info(json.dumps(event))
    # Optionally archive the incoming event to S3 for later analysis.
    logging_path = ctx["LoggingS3Path"]
    if logging_path != "" and Cfg.get_int("app.archive_interact_events"):
        archive_path = "%s/InteractEvents/%s.json" % (logging_path, ctx["now"])
        log.warning("Pushing Interact event in '%s'!" % archive_path)
        misc.put_s3_object(archive_path, Dbg.pprint(event))
    response = {}
    if ctx["o_interact"].handler(event, context, response):
        log.debug("API Gateway response: %s" % response)
    sqs.process_sqs_records(ctx, event)
    return response
def discovery_handler(event, context):
    """Entry point of the Discovery Lambda function.

    Parameters
    ----------
    event: dict, required
    context: object, required
        Lambda Context runtime methods and attributes
        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    -------
    The discovery structure computed by misc.discovery().
    """
    global ctx
    ctx["now"] = misc.utc_now()
    ctx["FunctionName"] = "Discovery"
    log.info("Processing start (version=%s)" % (ctx.get("CloneSquadVersion")))
    result = misc.discovery(ctx)
    log.debug(result)
    return result
def main_handler_entrypoint(event, context):
    """Entry point of the Main Lambda function (the periodic scaling loop).

    Handles the kill-switch, fast-path Spot interruption processing, the
    'called too early' throttle, then runs the full scheduling pipeline
    (target groups, EC2 scheduling, alarms, RDS subfleet, metrics, dashboard).

    Parameters
    ----------
    event: dict, required
    context: object, required
        Lambda Context runtime methods and attributes
        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    -------
    None
    """
    #print(Dbg.pprint(event))
    ctx["now"] = misc.utc_now()
    ctx["FunctionName"] = "Main"
    init()
    # Kill-switch: 'app.disable' short-circuits everything (except under SAM local).
    if Cfg.get_int("app.disable") != 0 and not misc.is_sam_local():
        log.warning("Application disabled due to 'app.disable' key")
        return
    no_is_called_too_early = False
    # Manage Spot interruption as fast as we can
    if sqs.process_sqs_records(event, function=ec2_schedule.manage_spot_notification, function_arg=ctx):
        log.info("Managed Spot Interruption SQS record!")
        # Force to run now disregarding `app.run_period` as we have at least one Spot instance to
        # remove from target groups immediately
        no_is_called_too_early = True
    # Check that we are not called too early
    # Note: We perform a direct read to the KVTable to spare initialization time when the
    # Lambda is called too early
    ctx["main.last_call_date"] = ctx["o_ec2"].get_state("main.last_call_date", direct=True)
    if ctx["main.last_call_date"] is None or ctx["main.last_call_date"] == "":
        ctx["main.last_call_date"] = str(misc.epoch())
    if not no_is_called_too_early and is_called_too_early():
        log.log(log.NOTICE, "Called too early by: %s" % event)
        # Too early: suppress notifications, acknowledge the SQS records and
        # ask to be called back later.
        notify.do_not_notify = True
        sqs.process_sqs_records(event)
        sqs.call_me_back_send()
        return
    log.debug("Load prerequisites.")
    load_prerequisites(["o_state", "o_notify", "o_ec2", "o_cloudwatch", "o_targetgroup", "o_ec2_schedule", "o_scheduler", "o_rds"])
    # Remember 'now' as the last execution date
    ctx["o_ec2"].set_state("main.last_call_date", value=ctx["now"], TTL=Cfg.get_duration_secs("app.default_ttl"))
    Cfg.dump()
    # Perform actions:
    log.debug("Main processing.")
    ctx["o_targetgroup"].manage_targetgroup()
    ctx["o_ec2_schedule"].schedule_instances()
    ctx["o_ec2_schedule"].stop_drained_instances()
    ctx["o_cloudwatch"].configure_alarms()
    ctx["o_rds"].manage_subfleet_rds()
    ctx["o_ec2_schedule"].prepare_metrics()
    ctx["o_cloudwatch"].send_metrics()
    ctx["o_cloudwatch"].configure_dashboard()
    # If we got woke up by SNS, acknowledge the message(s) now
    sqs.process_sqs_records(event)
    ctx["o_notify"].notify_user_arn_resources()
    # Call me back if needed
    sqs.call_me_back_send()
# --- Module initialization (runs once per Lambda container) ---
from notify import record_call_lt as RLT
import debug as Dbg
import config as Cfg
from aws_xray_sdk import global_sdk_config
# Enable AWS X-Ray tracing only when explicitly requested through the environment.
global_sdk_config.set_sdk_enabled("AWS_XRAY_SDK_ENABLED" in os.environ and os.environ["AWS_XRAY_SDK_ENABLED"] in ["1", "True", "true"])
from aws_xray_sdk.core import xray_recorder
from aws_xray_sdk.core import patch_all
patch_all()  # instrument supported libraries (boto3, requests, ...) with X-Ray
import cslog
log = cslog.logger(__name__)
log.debug("App started.")

# Import environment variables
# 'ctx' is the global context dictionary shared by all handlers; helper
# modules get a reference to it as well.
ctx = {"now": misc.utc_now()}
sqs.ctx = ctx
config.ctx = ctx
for env in os.environ:
    ctx[env] = os.getenv(env)

def fix_sam_bugs():
    """Patch the context with resource names that SAM local fails to propagate.

    2020/07/28: SAM local bug: DynamoDB tables and SNS Topics are not correctly
    propagated. Patch them manually.
    """
    account_id = os.getenv("ACCOUNT_ID")
    ctx["ConfigurationTable"] = "CloneSquad-%s%s-Configuration" % (ctx["GroupName"], ctx["VariantNumber"])
    ctx["AlarmStateEC2Table"] = "CloneSquad-%s%s-AlarmState-EC2" % (ctx["GroupName"], ctx["VariantNumber"])
    ctx["StateTable"] = "CloneSquad-%s%s-State" % (ctx["GroupName"], ctx["VariantNumber"])
    ctx["EventTable"] = "CloneSquad-%s%s-EventLog" % (ctx["GroupName"], ctx["VariantNumber"])
    ctx["LongTermEventTable"] = "CloneSquad-%s%s-EventLog-LongTerm" % (ctx["GroupName"], ctx["VariantNumber"])
    ctx["SchedulerTable"] = "CloneSquad-%s%s-Scheduler" % (ctx["GroupName"], ctx["VariantNumber"])
    ctx["MainSQSQueue"] = "https://sqs.%s.amazonaws.com/%s/CloneSquad-Main-%s" % (ctx["AWS_DEFAULT_REGION"], account_id, ctx["GroupName"])
def notify_user_arn_resources(self):
    """Forward pending (un-acknowledged) events to user-supplied notification ARNs.

    Parses the comma-separated `UserNotificationArns` context key, scans the
    event table for events lacking an `AckDate`, then pushes them to each
    Lambda/SQS/SNS target, truncating per-event metadata as needed to stay
    below the 256kB payload limit.
    """
    # Notify specified resources if needed
    user_notification_arns = self.context["UserNotificationArns"].split(",")
    notification_message = {}
    for arn in user_notification_arns:
        if "*" in arn or "?" in arn:
            # Ignore an ARN pattern:
            continue
        if arn == "":
            continue
        m = re.search("^arn:[a-z]+:([a-z]+):([-a-z0-9]+):([0-9]+):(.+)", arn)
        # Fix: re.search() returns None on a non-matching ARN (e.g. the literal
        # string "None"); the previous code dereferenced it unconditionally and
        # crashed with AttributeError.
        if m is None or len(m.groups()) < 4:
            log.warning("Failed to parse User supplied notification ARN '%s'!" % arn)
            continue
        notification_message[arn] = {}
        notification_message[arn]["service"] = m[1]
        notification_message[arn]["region"] = m[2]
        notification_message[arn]["account_id"] = m[3]
        notification_message[arn]["service_path"] = m[4]
        notification_message[arn]["content"] = {
            "Date": misc.utc_now(),
            "Metadata": {
                "AckLambdaARN": self.context["InteractLambdaArn"],
                "AckSQSUrl": self.context["InteractSQSUrl"]
            }
        }
    if len(notification_message) == 0:
        return
    try:
        dynamodb_client = self.context["dynamodb.client"]
        event_items = misc.dynamodb_table_scan(dynamodb_client, self.table_name)
    except Exception as e:
        log.exception("Failed to perform table scan on '%s' DynamodDB table! Notifications not sent... : %s " % (self.event_table, e))
        return
    # Flatten the structure to make it easily manageable: keep only events
    # that have not been acknowledged yet, newest first.
    events = []
    for e in event_items:
        if "AckDate" not in e or e["AckDate"] == "":
            events.append(e)
    events.sort(key=lambda x: datetime.fromisoformat(x["EventDate"]), reverse=True)
    if len(events) == 0:
        return
    # Collect the distinct event types (used as message attributes by callees).
    event_types = []
    for e in events:
        if e["EventType"] not in event_types:
            event_types.append(e["EventType"])
    # Oldest-first view; shallow copy so truncation below mutates the shared
    # event dicts (original behavior).
    events_r = events.copy()
    events_r.reverse()
    for arn in notification_message.keys():
        # Verify that message is not too big to send
        content = notification_message[arn]["content"]
        content["Events"] = events
        service = notification_message[arn]["service"]
        region = notification_message[arn]["region"]
        account_id = notification_message[arn]["account_id"]
        service_path = notification_message[arn]["service_path"]
        truncated_message = "Truncated to fit message size < 256kB"
        content_str = json.dumps(content, default=str)
        while len(content_str) >= 256 * 1024:
            # Truncate the metadata of the oldest not-yet-truncated event.
            # (use .get(): events read from DynamoDB may lack a Metadata field)
            for e in events_r:
                if e.get("Metadata") != truncated_message:
                    e["Metadata"] = truncated_message
                    break
            else:
                # Fix: every event is already truncated and the payload is
                # still too big — bail out to avoid the previous infinite
                # loop; the message is then sent best-effort.
                log.warning("Notification payload still >= 256kB after truncating all event metadata!")
                break
            content_str = json.dumps(content, default=str)
        try:
            if service == "lambda":
                self.call_lambda(arn, region, account_id, service_path, content_str, event_types)
            elif service == "sqs":
                self.call_sqs(arn, region, account_id, service_path, content_str, event_types)
            elif service == "sns":
                self.call_sns(arn, region, account_id, service_path, content_str, event_types)
        except Exception as e:
            log.warning("Failed to notify '%s'! Got Exception: %s" % (arn, e))
def _record_call(need_shortterm_record, is_success_func, f, *args, **kwargs):
    """Invoke `f(*args, **kwargs)` while recording the call as an event.

    Captures input/output (or exception), enriches the record with EC2 and
    TargetGroup metadata, writes it to the short-term and/or long-term
    DynamoDB event tables, prunes the long-term table, and optionally
    triggers a debug report export to S3. Any exception raised by `f` is
    re-raised after the bookkeeping is done.

    Parameters
    ----------
    need_shortterm_record: truthy when the event must go to the short-term table.
    is_success_func: optional callable (args, kwargs, result) -> bool used to
        decide whether a long-term record is needed; None disables it.
    f: the wrapped function.

    Returns
    -------
    The return value of `f`.
    """
    global records
    global notify_mgr
    record = {}
    record["EventType"] = f.__name__
    record["Input"] = {"*args": list(args), "**kwargs": dict(kwargs)}
    managed_exception = None
    # --- Call the wrapped function, capturing result or exception ---
    xray_recorder.begin_subsegment("notifycall-call:%s" % f.__name__)
    try:
        r = f(*args, **kwargs)
        record["Output"] = json.dumps(r, default=str)
    except Exception as e:
        managed_exception = e
        record["Except"] = {
            "Exception": traceback.format_exc(),
            "Stackstrace": traceback.extract_stack(),
            "Reason": json.dumps(e, default=str)
        }
    xray_recorder.end_subsegment()
    if managed_exception is not None:
        # Persist now all aggregated data to not lose them
        xray_recorder.begin_subsegment("notifycall-persist_aggregates:%s" % f.__name__)
        try:
            KVTable.persist_aggregates()
        except Exception as e:
            log.exception("Failed to persist aggregated date!")
        xray_recorder.end_subsegment()
    # No event-table write when notifications are disabled or not yet set up.
    if notify_mgr is None or do_not_notify:
        log.debug("Do not write Event in event table: notify_mgr=%s, do_not_notify=%s" % (notify_mgr, do_not_notify))
        if managed_exception is not None:
            raise managed_exception
        return r
    ctx = notify_mgr.context
    try:
        # NOTE(review): ternary precedence means this is False when
        # is_success_func is None, even if an exception occurred — confirm
        # this is intended.
        need_longterm_record = managed_exception is not None or not is_success_func(args, kwargs, r) if is_success_func is not None else False
    except Exception as e:
        log.exception("Got an exception while assessing long term event management : %s" % e)
        need_longterm_record = True
    # Try to catch the maximum available metadata to ease later diagnosis
    # Protect against exceptions to ensure proper logging
    record["Metadata"] = {}
    xray_recorder.begin_subsegment("notifycall-build_metadata:%s" % f.__name__)
    try:
        notify_mgr.ec2.get_prerequisites(only_if_not_already_done=True)
        record["Metadata"]["EC2"] = {
            "AllInstanceDetails": notify_mgr.ec2.get_instances(),
            "AllInstanceStatuses": notify_mgr.ec2.get_instance_statuses(),
            "DrainingInstances": [i["InstanceId"] for i in notify_mgr.ec2.get_instances(ScalingState="draining")],
            "BouncedInstances": [i["InstanceId"] for i in notify_mgr.ec2.get_instances(ScalingState="bounced")],
            "ExcludedInstances": [i["InstanceId"] for i in notify_mgr.ec2.get_instances(ScalingState="excluded")],
            "ErrorInstances": [i["InstanceId"] for i in notify_mgr.ec2.get_instances(ScalingState="error")],
            "ScalingStates": notify_mgr.ec2.get_all_scaling_states()
        }
    except Exception as e:
        log.exception('Failed to create record["Metadata"]["EC2"] : %s' % e)
    xray_recorder.end_subsegment()
    xray_recorder.begin_subsegment("notifycall-build_metadata_targetgroup:%s" % f.__name__)
    try:
        notify_mgr.targetgroup.get_prerequisites(only_if_not_already_done=True)
        record["Metadata"]["TargetGroups"] = notify_mgr.targetgroup.get_targetgroups_info()
    except Exception as e:
        log.exception('Failed to create record["Metadata"]["TargetGroups"] : %s' % e)
    xray_recorder.end_subsegment()
    # Compress the (potentially large) metadata before storage.
    for key in ["Metadata"]:
        zipped_bytes = gzip.compress(bytes(json.dumps(record[key], default=str), "utf-8"))
        record[key] = str(base64.b64encode(zipped_bytes), "utf-8")
    now = misc.utc_now()
    now_seconds = misc.seconds_from_epoch_utc()
    max_longterm_records = Cfg.get_int("notify.event.longterm.max_records")
    if max_longterm_records <= 0:
        need_longterm_record = 0
    tables = [
        {
            "Name": ctx["EventTable"],
            "NeedWrite": need_shortterm_record,
            "TTL": Cfg.get_duration_secs("notify.event.default_ttl"),
            "DBImages": False,
            "DebugReport": False
        },
        {
            "Name": ctx["LongTermEventTable"],
            "NeedWrite": need_longterm_record,
            "TTL": Cfg.get_duration_secs("notify.event.longterm.ttl"),
            "DBImages": True,
            "DebugReport": True
        },
    ]
    xray_recorder.begin_subsegment("notifycall-update_tables:%s" % f.__name__)
    for table in tables:
        if not table["NeedWrite"]:
            continue
        UpdateExpression = "set EventSource=:entrypoint, EventType=:eventtype, InputData=:input, OutputData=:output, HandledException=:exception, "
        UpdateExpression += "Metadata=:metadata, ExpirationTime=:expirationtime"
        ExpressionAttributeValues = {
            ':entrypoint': {'S': ctx["FunctionName"]},
            ':eventtype': {'S': record["EventType"]},
            ':input': {'S': json.dumps(record["Input"], default=str)},
            ':output': {'S': json.dumps(record["Output"] if "Output" in record else {}, default=str)},
            ':exception': {'S': json.dumps(record["Except"] if "Except" in record else "", default=str)},
            ':metadata': {'S': json.dumps(record["Metadata"], default=str)},
            ':expirationtime': {'N': str(now_seconds + table["TTL"])}
        }
        if table["DBImages"]:
            # Insert snapshots of the CloudWatch dashboard
            try:
                log.log(log.NOTICE, "Generating snapshots for Dashboard graphs...")
                images = notify_mgr.cloudwatch.get_dashboard_images()
                for i in images:
                    compressed_name = i.replace(" ", "")
                    UpdateExpression += ", Graph_%s_PNG=:graph%s" % (compressed_name, compressed_name)
                    ExpressionAttributeValues[":graph%s" % compressed_name] = {'S': images[i]}
                log.info("/!\ Generated CloudWatch dashboard PNG snapshots in DynamoDb table '%s' for further event analysis!" % table["Name"])
            except Exception as e:
                log.exception("Failed to retrieve CloudWatch snapshot images! : %s" % e)
        response = ctx["dynamodb.client"].update_item(
            Key={"EventDate": {'S': str(now)}},
            UpdateExpression=UpdateExpression,
            ExpressionAttributeValues=ExpressionAttributeValues,
            ReturnConsumedCapacity='TOTAL',
            TableName=table["Name"],
        )
        log.debug(Dbg.pprint(response))
        log.log(log.NOTICE, "Written event '[%s] %s' to table '%s'." % (str(now), record["EventType"], table["Name"]))
        # Keep under control the number of LongTerm items stored in DynamoDB table
        if need_longterm_record:
            longterm_item_eventdates = [m["_"] for m in notify_mgr.state.get_metastring_list("notify.longterm.itemlist", default=[])]
            log.log(log.NOTICE, "Guessed number of records in LongTerm Event table : %d", len(longterm_item_eventdates))
            longterm_item_eventdates.append(str(now))
            nb_records_to_delete = max(len(longterm_item_eventdates) - max_longterm_records, 0)
            for eventdate in longterm_item_eventdates[:nb_records_to_delete]:
                try:
                    response = ctx["dynamodb.client"].delete_item(Key={'EventDate': {'S': eventdate}}, TableName=ctx["LongTermEventTable"])
                    log.debug(response)
                    log.log(log.NOTICE, "Purged LongTerm Event record '%s' as too many are already stored (notify.event.longterm.max_records=%d)" % (eventdate, max_longterm_records))
                except Exception as e:
                    # Fix: the format string used '%e' (float conversion) on an
                    # Exception object, which raised a TypeError inside this
                    # handler; '%s' is the intended conversion.
                    log.exception("Got exception while deleting LongTerm record '%s' : %s" % (eventdate, e))
            notify_mgr.state.set_state("notify.longterm.itemlist", ";".join(longterm_item_eventdates[nb_records_to_delete:]), TTL=Cfg.get_duration_secs("notify.event.longterm.ttl"))
            try:
                KVTable.persist_aggregates()
            except Exception as e:
                log.exception("Got exception while persisting KVTables : %s" % e)
        # Manage Debug report export to S3
        url = ctx["LoggingS3Path"]
        if url != "" and table["DebugReport"] and Cfg.get_int("notify.debug.send_s3_reports"):
            xray_recorder.begin_subsegment("notifycall-publish_all_reports:%s" % f.__name__)
            if ctx["FunctionName"] == "Interact":
                # Avoid recursion if throwing from InteractFunction
                log.info("Publishing Debug reports synchronously...")
                debug.publish_all_reports(ctx, url, "notifymgr_report")
            else:
                client = ctx["sqs.client"]
                log.info("Notifying Interact SQS Queue '%s' for asynchronous debug report generation..." % ctx["InteractSQSUrl"])
                response = client.send_message(QueueUrl=ctx["InteractSQSUrl"], MessageBody=json.dumps({
                    "OpType": "Debug/PublishReportNow",
                    "Events": {
                        "Timestamp": str(ctx["now"])
                    }
                }))
                log.debug(response)
            xray_recorder.end_subsegment()
    xray_recorder.end_subsegment()
    if managed_exception is not None:
        raise managed_exception
    return r
def seconds_since_last_call():
    """Seconds elapsed since the last recorded Main function run.

    Returns 0 when no 'main.last_call_date' has been recorded yet in the
    global context.
    """
    if "main.last_call_date" not in ctx:
        return 0
    last_call = misc.str2utc(ctx["main.last_call_date"], default=misc.epoch())
    elapsed = misc.utc_now() - last_call
    return elapsed.total_seconds()
def notify_user_arn_resources(self):
    """Send pending (un-acknowledged) events to the user-supplied notification ARNs.

    Reads the comma-separated `UserNotificationArns` context key, scans the
    event table for events without an `AckDate`, deduplicates repeated
    per-event metadata and chunks the payload to stay below the 256kB service
    limit before notifying each Lambda/SQS/SNS target.
    """
    if self.context["UserNotificationArns"] in ["", "None"]:
        return
    # Notify specified resources if needed
    user_notification_arns = self.context["UserNotificationArns"].split(",")
    notification_message = {}
    for arn in user_notification_arns:
        if "*" in arn or "?" in arn:
            # Ignore an ARN pattern:
            continue
        if arn == "":
            continue
        m = re.search("^arn:[a-z]+:([a-z]+):([-a-z0-9]+):([0-9]+):(.+)", arn)
        # Fix: re.search() returns None on a malformed ARN; the previous code
        # dereferenced it unconditionally and crashed with AttributeError.
        if m is None or len(m.groups()) < 4:
            log.warning("Failed to parse User supplied notification ARN '%s'!" % arn)
            continue
        notification_message[arn] = {}
        notification_message[arn]["service"] = m[1]
        notification_message[arn]["region"] = m[2]
        notification_message[arn]["account_id"] = m[3]
        notification_message[arn]["service_path"] = m[4]
    if len(notification_message) == 0:
        return
    try:
        dynamodb_client = self.context["dynamodb.client"]
        event_items = misc.dynamodb_table_scan(dynamodb_client, self.table_name)
    except Exception as e:
        log.exception("Failed to perform table scan on '%s' DynamodDB table! Notifications not sent... : %s " % (self.event_table, e))
        return
    # Flatten the structure to make it easily manageable
    events = []
    for e in event_items:
        if "AckDate" not in e or e["AckDate"] == "":
            events.append(e)
    events.sort(key=lambda x: datetime.fromisoformat(x["EventDate"]), reverse=True)
    if len(events) == 0:
        return
    # Send events from the older to the younger
    msg = {
        "Date": misc.utc_now(),
        "Metadata": {
            "AckLambdaARN": self.context["InteractLambdaArn"],
            "AckSQSUrl": self.context["InteractSQSUrl"],
            "ApiGWUrl": self.context["InteractAPIGWUrl"]
        }
    }
    events_r = events.copy()
    events_r.reverse()
    # Optimize message size by deduplicating metadata
    messages = []
    while len(events_r):
        msg["Events"] = []
        m = None
        index = 0
        for i in range(0, len(events_r)):
            if "Metadata" not in events_r[i]:
                log.error("Malformed event %s/%s read from DynamoDB! (???) Skipping it..." % (events_r[i]["EventDate"], events_r[i]["EventType"]))
                continue
            # Size control: check if the next event will make the message too big.
            # Suppress the metadata field when it has the same value as the previous event.
            meta = events_r[i]["Metadata"]
            event = events_r[i].copy()
            if m != meta:
                m = meta
            else:
                del event["Metadata"]
            msg["Events"].append(event)
            msg_size = len(json.dumps(msg, default=str))
            if msg_size > 255*1024:
                # Chunk is going to be too big. Create a new chunk now...
                msg["Events"].remove(event)
                if i == 0:
                    event_type = events_r[0]["EventType"]
                    event_date = events_r[0]["EventDate"]
                    # Fix: 'msg_size' was previously undefined here (NameError)
                    # and the second literal lacked the f-prefix, so
                    # {event_date}/{event_type} never interpolated.
                    log.error(f"Notification message too big ({msg_size} > 256kB)! Possible cause is too many instances "
                              f"under management... This message {event_date}/{event_type} will be discarded...")
                break
            index = i
        if len(msg["Events"]):
            messages.append(msg["Events"])
        events_r = events_r[index+1:]
    for m in messages:
        if len(m) == 0:
            continue
        msg["Events"] = m
        content_str = json.dumps(msg, default=str)
        # Collect the distinct event types of this chunk.
        event_types = []
        for e in m:
            if e["EventType"] not in event_types:
                event_types.append(e["EventType"])
        log.log(log.NOTICE, "Notification payload size: %s" % len(content_str))
        for arn in notification_message.keys():
            service = notification_message[arn]["service"]
            region = notification_message[arn]["region"]
            account_id = notification_message[arn]["account_id"]
            service_path = notification_message[arn]["service_path"]
            try:
                if service == "lambda":
                    self.call_lambda(arn, region, account_id, service_path, content_str, event_types)
                elif service == "sqs":
                    self.call_sqs(arn, region, account_id, service_path, content_str, event_types)
                elif service == "sns":
                    self.call_sns(arn, region, account_id, service_path, content_str, event_types)
            except Exception as e:
                log.warning("Failed to notify '%s'! Got Exception: %s" % (arn, e))