class SetupHelperHandler(CustomResource):

    def __init__(self, event, context):
        """
        Initializes helper setup class
        :param event: CloudFormation custom resource event
        :param context: Lambda context
        """
        CustomResource.__init__(self, event, context)

        self.arguments = {a: self.resource_properties[a] for a in self.resource_properties
                          if a not in ["ServiceToken", "Timeout"]}

        self.configuration_bucket = os.getenv(configuration.ENV_CONFIG_BUCKET, None)
        self.automator_role_arn = self.arguments.get("OpsAutomatorLambdaRole")
        self.events_forward_role = self.arguments.get("EventForwardLambdaRole")
        self.ops_automator_topic_arn = self.arguments.get("OpsAutomatorTopicArn")
        self.use_ecs = TaskConfiguration.as_boolean(self.arguments.get("UseEcs", False))
        self.optimize_cross_account_template = TaskConfiguration.as_boolean(
            self.arguments.get("OptimizeCrossAccountTemplate", False))

        self.account = os.getenv(handlers.ENV_OPS_AUTOMATOR_ACCOUNT)

        self.stack_version = self.arguments["StackVersion"]

        # setup logging
        dt = datetime.utcnow()
        classname = self.__class__.__name__
        logstream = LOG_STREAM.format(classname, dt.year, dt.month, dt.day)
        self._logger = QueuedLogger(logstream=logstream, context=context, buffersize=50)

    @classmethod
    def is_handling_request(cls, event, _):
        """
        Test if the event is handled by this handler
        :param _:
        :param event: Event to test
        :return: True if the event is a CloudFormation event for the OpsAutomatorSetupHelper custom resource
        """
        return event.get("StackId") is not None and event.get("ResourceType") == "Custom::OpsAutomatorSetupHelper"

    def handle_request(self):
        """
        Handles the custom resource request from cloudformation
        :return:
        """

        start = datetime.now()

        self._logger.info("Cloudformation request is {}", safe_json(self._event, indent=2))

        try:
            result = CustomResource.handle_request(self)

            return safe_dict({
                "result": result,
                "datetime": datetime.now().isoformat(),
                "running-time": (datetime.now() - start).total_seconds()
            })
        except Exception as ex:
            self._logger.error(ERR_HANDLING_SETUP_REQUEST, ex, full_stack())
            raise ex

        finally:
            self._logger.flush()

    def _set_lambda_logs_retention_period(self):
        """
        Aligns retention period for default Lambda log streams with settings
        :return:
        """

        if self._context is None:
            return

        log_client = get_client_with_retries("logs",
                                             methods=[
                                                 "delete_retention_policy",
                                                 "put_retention_policy",
                                                 "create_log_group",
                                                 "describe_log_groups"
                                             ],
                                             context=self.context)

        retention_days = self.arguments.get("LogRetentionDays")

        base_name = self._context.log_group_name[0:-len("Standard")]
        log_groups = [
            base_name + size for size in [
                "Standard", "Medium", "Large", "XLarge", "XXLarge", "XXXLarge"
            ]
        ]

        existing_groups = [l["logGroupName"] for l in
                           log_client.describe_log_groups_with_retries(
                               logGroupNamePrefix=base_name).get("logGroups", [])]

        for group in log_groups:

            exists = group in existing_groups
            self._logger.info("Setting retention for log group {}", group)

            if retention_days is None:
                if not exists:
                    continue
                self._logger.info(INF_DELETE_LOG_RETENTION_POLICY, group)
                log_client.delete_retention_policy_with_retries(logGroupName=group)
            else:
                if not exists:
                    log_client.create_log_group(logGroupName=group)
                self._logger.info(INF_SET_LOG_RETENTION_POLICY, group, retention_days)
                log_client.put_retention_policy_with_retries(logGroupName=group,
                                                             retentionInDays=int(retention_days))

    def _setup(self):
        """
        OpsAutomatorSetupHelper setup actions
        :return:
        """
        self._set_lambda_logs_retention_period()
        if self.configuration_bucket:
            self.generate_templates()

    def _send_create_metrics(self):

        metrics_data = {
            "Type": "stack",
            "Version": self.stack_version,
            "StackHash": sha256(self.stack_id.encode()).hexdigest(),
            "Data": {
                "Status": "stack_create",
                "Region": self.region
            }
        }

        send_metrics_data(metrics_data=metrics_data, logger=self._logger)

    def _send_delete_metrics(self):

        metrics_data = {
            "Type": "stack",
            "Version": self.stack_version,
            "StackHash": sha256(self.stack_id.encode()).hexdigest(),
            "Data": {
                "Status": "stack_delete",
                "Region": self.region
            }
        }

        send_metrics_data(metrics_data=metrics_data, logger=self._logger)

    def _create_request(self):
        """
        Handles create request from cloudformation custom resource
        :return:
        """
        try:
            self._setup()
            self.physical_resource_id = self.__class__.__name__.lower()
            if allow_send_metrics():
                self._send_create_metrics()
            return True
        except Exception as ex:
            self.response["Reason"] = str(ex)
            return False

    def _update_request(self):
        """
        Handles update request from cloudformation custom resource
        :return:
        """
        try:
            self._setup()
            return True
        except Exception as ex:
            self.response["Reason"] = str(ex)
            return False

    def _delete_request(self):
        """
        Handles delete request from cloudformation custom resource
        :return:
        """
        try:
            self.delete_templates()
            self.delete_external_task_config_stacks()
            if allow_send_metrics():
                self._send_delete_metrics()
            return True
        except Exception as ex:
            self.response["Reason"] = str(ex)
            return False

    def delete_external_task_config_stacks(self):
        """
        Deletes external stacks that were used to create configuration items
        :return:
        """

        self._logger.info(INF_DELETING_STACKS)

        stacks = TaskConfiguration(context=self.context, logger=self._logger).get_external_task_configuration_stacks()

        if len(stacks) == 0:
            self._logger.info(INF_NO_STACKS)
            return

        self._logger.info(INF_DELETED_STACKS, ", ".join(stacks))

        cfn = boto3.resource("cloudformation")

        for s in stacks:
            self._logger.info(INF_STACK)
            try:
                stack = cfn.Stack(s)
                add_retry_methods_to_resource(stack, ["delete"], context=self.context)
                stack.delete_with_retries()
            except Exception as ex:
                self._logger.error(ERR_DELETING_STACK, s, str(ex))

    def generate_templates(self):
        """
        Generates configuration and cross-account role templates
        :return:
        """

        def generate_configuration_template(s3, builder, action):
            configuration_template = S3_KEY_ACTION_CONFIGURATION_TEMPLATE.format(action)
            self._logger.info(INF_CREATE_ACTION_TEMPLATE, action, configuration_template)
            template = json.dumps(builder.build_template(action), indent=3)
            s3.put_object_with_retries(Body=template, Bucket=self.configuration_bucket, Key=configuration_template)

        def generate_all_actions_cross_account_role_template_parameterized(s3, builder, all_act, template_description):
            self._logger.info(INF_CREATE_ALL_ACTIONS_CROSS_ROLES_TEMPLATE, S3_KEY_ACCOUNT_CONFIG_WITH_PARAMS)
            template = builder.build_template(action_list=all_act,
                                              description=template_description,
                                              with_conditional_params=True)
            if self.optimize_cross_account_template:
                template = CrossAccountRoleBuilder.compress_template(template)
            template_json = json.dumps(template, indent=3)
            s3.put_object_with_retries(Body=template_json,
                                       Bucket=self.configuration_bucket,
                                       Key=S3_KEY_ACCOUNT_CONFIG_WITH_PARAMS)

        # noinspection PyUnusedLocal
        def generate_all_actions_cross_account_role_template(s3, builder, all_act, template_description):
            self._logger.info(INF_CREATE_ALL_ACTIONS_CROSS_ROLES_TEMPLATE, S3_KEY_ACCOUNT_CONFIG_CREATE_ALL)
            template = json.dumps(
                builder.build_template(action_list=all_act,
                                       description=template_description,
                                       with_conditional_params=False),
                indent=3)
            s3.put_object_with_retries(Body=template,
                                       Bucket=self.configuration_bucket,
                                       Key=S3_KEY_ACCOUNT_CONFIG_CREATE_ALL)

        def generate_forward_events_template(s3):
            self._logger.info(INF_CREATE_EVENT_FORWARD_TEMPLATE, S3_KEY_ACCOUNT_EVENTS_FORWARD_TEMPLATE)
            template = build_events_forward_template(
                template_filename="./cloudformation/{}".format(FORWARD_EVENTS_TEMPLATE),
                script_filename="./forward-events.py",
                stack=self.stack_name,
                event_role_arn=self.events_forward_role,
                ops_automator_topic_arn=self.ops_automator_topic_arn,
                version=self.stack_version)
            s3.put_object_with_retries(Body=template,
                                       Bucket=self.configuration_bucket,
                                       Key=S3_KEY_ACCOUNT_EVENTS_FORWARD_TEMPLATE)

        def generate_scenario_templates(s3):
            self._logger.info("Creating task scenarios templates")

            for template_name, template in list(builders.build_scenario_templates(
                    templates_dir="./cloudformation/scenarios", stack=self.stack_name)):
                self._logger.info(INF_SCENARIO_TEMPLATE, template_name, S3_KEY_SCENARIO_TEMPLATE_BUCKET)
                s3.put_object_with_retries(Body=template,
                                           Bucket=self.configuration_bucket,
                                           Key=S3_KEY_SCENARIO_TEMPLATE_KEY.format(template_name))

        def generate_custom_resource_builder(s3):
            self._logger.info("Create custom resource builder script {}", S3_KEY_CUSTOM_RESOURCE_BUILDER)

            with open("./build_task_custom_resource.py", "rt") as f:
                script_text = "".join(f.readlines())

            script_text = script_text.replace("%stack%", self.stack_name)
            script_text = script_text.replace("%account%", self.account)
            script_text = script_text.replace("%region%", self.region)
            script_text = script_text.replace("%config_table%", os.getenv("CONFIG_TABLE"))

            s3.put_object_with_retries(Body=script_text,
                                       Bucket=self.configuration_bucket,
                                       Key=S3_KEY_CUSTOM_RESOURCE_BUILDER)

        def generate_actions_html_page(s3):
            self._logger.info("Generating Actions HTML page {}", S3_KEY_ACTIONS_HTML_PAGE)
            html = builders.generate_html_actions_page(html_file="./builders/actions.html", region=self.region)
            s3.put_object_with_retries(Body=html,
                                       Bucket=self.configuration_bucket,
                                       Key=S3_KEY_ACTIONS_HTML_PAGE,
                                       ContentType="text/html")

        self._logger.info(INF_GENERATING_TEMPLATES, self.configuration_bucket)
        try:
            stack = os.getenv(handlers.ENV_STACK_NAME, "")
            s3_client = get_client_with_retries("s3", ["put_object"], context=self.context)
            config_template_builder = ActionTemplateBuilder(
                self.context,
                service_token_arn="arn:aws:region:account:function:used-for-debug-only",
                ops_automator_role=self.automator_role_arn,
                use_ecs=self.use_ecs)
            role_template_builder = CrossAccountRoleBuilder(self.automator_role_arn, stack)

            all_actions = []
            for action_name in actions.all_actions():
                action_properties = actions.get_action_properties(action_name)
                if not action_properties.get(actions.ACTION_INTERNAL, False):
                    generate_configuration_template(s3_client, config_template_builder, action_name)
                    # Enable to generate a template for every individual action
                    # description = TEMPLATE_DESC_CROSS_ACCOUNT_ACTION.format(action_name, stack, account)
                    # generate_action_cross_account_role_template(s3_client, role_template_builder, action_name, description)
                    all_actions.append(action_name)

            if len(all_actions) > 0:
                description = TEMPLATE_DESC_ALL_ACTIONS_PARAMETERS.format(stack, self.account)
                generate_all_actions_cross_account_role_template_parameterized(s3_client, role_template_builder,
                                                                               all_actions, description)
                # enable to generate a template with all actions enabled
                # description = TEMPLATE_DESC_ALL_ACTIONS.format(stack, account)
                # generate_all_actions_cross_account_role_template(s3_client, role_template_builder, all_actions, description)

            for action_name in actions.all_actions():
                action_properties = actions.get_action_properties(action_name)
                if action_properties.get(actions.ACTION_EVENTS, None) is not None:
                    generate_forward_events_template(s3_client)
                    break

            generate_actions_html_page(s3_client)

            generate_scenario_templates(s3_client)

            generate_custom_resource_builder(s3_client)

        except Exception as ex:
            self._logger.error(ERR_BUILDING_TEMPLATES, str(ex), full_stack())

    def delete_templates(self):

        s3_client = get_client_with_retries("s3", ["delete_object"], context=self.context)
        s3_key = ""
        try:
            for action_name in actions.all_actions():
                action_properties = actions.get_action_properties(action_name)
                if not action_properties.get(actions.ACTION_INTERNAL, False):
                    self._logger.info(INF_DELETING_ACTION_TEMPLATE, action_name)
                    s3_key = S3_KEY_ACTION_CONFIGURATION_TEMPLATE.format(action_name)
                    s3_client.delete_object_with_retries(Bucket=self.configuration_bucket, Key=s3_key)
        except Exception as ex:
            self._logger.error(ERR_DELETE_CONFIG_ITEM, s3_key, self.configuration_bucket, str(ex))

        self._logger.info(INF_DELETE_ALL_ACTIONS_TEMPLATE)
        for key in [S3_KEY_ACTIONS_HTML_PAGE,
                    S3_KEY_ACCOUNT_CONFIG_WITH_PARAMS,
                    S3_KEY_ACCOUNT_CONFIG_CREATE_ALL,
                    S3_KEY_ACCOUNT_EVENTS_FORWARD_TEMPLATE]:
            try:
                s3_client.delete_object_with_retries(Bucket=self.configuration_bucket, Key=key)
            except Exception as ex:
                self._logger.error(ERR_DELETE_CONFIG_ITEM, key, self.configuration_bucket, str(ex))
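
# Editorial usage sketch, not part of the original solution: the minimal shape of the
# CloudFormation custom-resource event that SetupHelperHandler.is_handling_request()
# accepts. The stack ARN and property values below are illustrative placeholders only.
def _example_setup_helper_event():
    return {
        "StackId": "arn:aws:cloudformation:us-east-1:111111111111:stack/ops-automator/guid",
        "ResourceType": "Custom::OpsAutomatorSetupHelper",
        "RequestType": "Create",
        "ResourceProperties": {
            # ServiceToken and Timeout are stripped from self.arguments in __init__
            "ServiceToken": "arn:aws:lambda:us-east-1:111111111111:function:ops-automator",
            "StackVersion": "1.0.0.0"
        }
    }
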
class TaskTrackingHandler(object):
    """
    Class to handle events triggered by inserting new items in the actions tracking table.
    """

    def __init__(self, event, context):
        """
        Initializes the instance.
        :param event: Handled event
        :param context: Context if running in Lambda
        """
        self._context = context
        self._event = event
        self._tracking_table = None
        self._concurrency_table = None
        self.started_tasks = 0
        self.started_waiting_tasks = 0
        self.waiting_for_execution_tasks = 0
        self.started_completion_checks = 0
        self.finished_concurrency_tasks = 0
        self.done_work = False
        self.invoked_lambda_functions = []

        self.events_client = None
        self._s3_client = None
        self._db_client = None

        # setup logging
        classname = self.__class__.__name__
        dt = datetime.utcnow()
        logstream = LOG_STREAM.format(classname, dt.year, dt.month, dt.day)
        self._logger = QueuedLogger(logstream=logstream,
                                    context=self._context,
                                    buffersize=20,
                                    debug=os.getenv(ENV_DEBUG_TASK_TACKING_HANDLER, "false").lower() == "true")

    @classmethod
    def is_handling_request(cls, event, context):

        # In simulation the handler is called directly when inserting or updating items in the table
        if handlers.running_local(context):
            return False

        if event.get("Records", [{}])[0].get("eventSource", "") != "aws:dynamodb":
            return False

        source_arn = event["Records"][0]["eventSourceARN"]
        table_name = source_arn.split("/")[1]
        return table_name in [
            os.getenv(handlers.ENV_ACTION_TRACKING_TABLE),
            os.getenv(handlers.ENV_CONCURRENCY_TABLE)
        ]

    @classmethod
    def task_string(cls, action):
        return TASK_ACTION_STRINGS[action] if 0 <= action < len(TASK_ACTION_STRINGS) else "Unknown"

    @property
    def tracking_table(self):
        """
        Gets an instance of the tracking table and uses it in subsequent calls
        :return: Instance of the tracking table
        """
        if self._tracking_table is None:
            self._tracking_table = TaskTrackingTable(self._context, self._logger)
        return self._tracking_table

    @property
    def s3_client(self):
        if self._s3_client is None:
            self._s3_client = boto_retry.get_client_with_retries("s3", ["delete_object"], logger=self._logger)
        return self._s3_client

    @property
    def db_client(self):
        if self._db_client is None:
            self._db_client = boto_retry.get_client_with_retries("dynamodb", ["delete_item"], logger=self._logger)
        return self._db_client

    def _get_action_concurrency_key(self, item):
        """
        Gets the concurrency key for a task's action
        :param item: The task item
        :return: The concurrency key for the task's action
        """
        action = item[handlers.TASK_TR_ACTION]

        # get the name of the optional method to return the concurrency key
        action_class = actions.get_action_class(action)
        concurrency_key_method = getattr(action_class, handlers.ACTION_CONCURRENCY_KEY_METHOD, None)

        # prepare parameters for calling static function that returns the concurrency key
        if concurrency_key_method is not None:
            get_key_params = {
                actions.ACTION_PARAM_RESOURCES: handlers.get_item_resource_data(item, self._context),
                actions.ACTION_PARAM_ACCOUNT: item[handlers.TASK_TR_ACCOUNT],
                actions.ACTION_PARAM_STACK: os.getenv(handlers.ENV_STACK_NAME),
                actions.ACTION_PARAM_STACK_ID: os.getenv(handlers.ENV_STACK_ID),
                actions.ACTION_PARAM_TASK_ID: item[handlers.TASK_TR_ID],
                actions.ACTION_PARAM_TASK: item[handlers.TASK_TR_NAME]
            }
            get_key_params.update(item.get(handlers.TASK_TR_PARAMETERS))
            return concurrency_key_method(get_key_params)
        else:
            # if this method is not available for the action then use the name of the action as the key
            return action

    def _enter_waiting_list(self, concurrency_key):
        """
        Adds 1 to the waiting list counter for the specified concurrency key and returns the new value
        :param concurrency_key: Concurrency key for counter
        :return: Updated counter
        """

        # update/read counter for the concurrency key
        if not handlers.running_local(self._context):
            resp = self.concurrency_table.update_item_with_retries(
                Key={CONCURRENCY_ID: concurrency_key},
                UpdateExpression="ADD InstanceCount :one SET RunNext=:run",
                ExpressionAttributeValues={
                    ":one": 1,
                    ":run": False
                },
                ReturnValues="UPDATED_NEW")
            return int(resp["Attributes"].get("InstanceCount", 0))
        else:
            resp = self.concurrency_table.get_item_with_retries(Key={CONCURRENCY_ID: concurrency_key})
            return resp.get("Item", {}).get(ACTIVE_INSTANCES, 0)

    def _leave_waiting_list(self, task_id, concurrency_key):
        """
        Subtracts 1 from the waiting list counter for the specified concurrency key and returns the new value.
        If the counter reaches 0 then the entry for the concurrency key is removed
        :param task_id: Id of the task
        :param concurrency_key: Concurrency key for counter
        :return: Updated counter
        """

        # make a consistent read of the task
        self.tracking_table.get_task_item(task_id)

        if not handlers.running_local(self._context):
            resp = self.concurrency_table.update_item_with_retries(
                Key={CONCURRENCY_ID: concurrency_key},
                UpdateExpression="ADD InstanceCount :min_one SET RunNext=:run",
                ExpressionAttributeValues={
                    ":min_one": -1,
                    ":run": True
                },
                ReturnValues="UPDATED_NEW")
            count = max(0, int(resp["Attributes"].get(ACTIVE_INSTANCES, 0)))

            # remove entry if no more waiting items for this key
            if count == 0:
                self.concurrency_table.delete_item_with_retries(Key={CONCURRENCY_ID: concurrency_key})
        else:
            resp = self.concurrency_table.get_item_with_retries(Key={CONCURRENCY_ID: concurrency_key})
            count = resp.get("Item", {}).get(ACTIVE_INSTANCES, 0)
            TaskTrackingTable._run_local_stream_event(
                os.getenv(handlers.ENV_CONCURRENCY_TABLE), "UPDATE",
                {
                    "ConcurrencyId": concurrency_key,
                    "InstanceCount": count
                },
                {
                    "ConcurrencyId": concurrency_key,
                    "InstanceCount": count + 1
                },
                self._context)

        return count

    @property
    def concurrency_table(self):
        """
        Returns the table that tracks the number of concurrent executions per concurrency key.
        :return: concurrency table
        """
        if self._concurrency_table is None:
            tablename = os.getenv(handlers.ENV_CONCURRENCY_TABLE)

            self._logger.debug("Using concurrency table {}", tablename)

            self._concurrency_table = services.get_session().resource("dynamodb").Table(tablename)
            boto_retry.add_retry_methods_to_resource(self._concurrency_table,
                                                     ["update_item", "get_item", "delete_item"],
                                                     context=self._context)

        return self._concurrency_table

    def _is_wait_listed(self, item):
        """
        Tests if there is a max concurrency level for the task's action. If this is the case then a concurrency key
        is retrieved from the action and it is used to update the counter in the concurrency table for that key.
        The updated counter is tested against the max concurrency level for the task's action
        :param item: task item
        :return: True if the counter for the task's action concurrency key > max concurrency level, False if it is
        less than or equal to that level or the action has no max concurrency level
        """

        action = item.get(handlers.TASK_TR_ACTION, None)
        if action is None:
            return False

        action_properties = actions.get_action_properties(action)

        # test if there are concurrency restrictions
        max_action_concurrency = action_properties.get(actions.ACTION_MAX_CONCURRENCY)

        # no maximum
        if max_action_concurrency in [None, 0]:
            return False

        # property may be a lambda function, call the function with parameters of task as lambda parameters
        if types.FunctionType == type(max_action_concurrency):
            parameters = item[handlers.TASK_TR_PARAMETERS]
            max_action_concurrency = max_action_concurrency(parameters)
            if max_action_concurrency in [None, 0]:
                return False

        # get the key for the task's action
        concurrency_key = self._get_action_concurrency_key(item)

        # enter the waiting list for that key
        count = int(self._enter_waiting_list(concurrency_key))

        # set status to waiting if count > max concurrency level
        status = handlers.STATUS_WAITING if count > int(max_action_concurrency) else None

        # store the concurrency key twice, the concurrency id is used for the index in the GSI and is removed after
        # the action is handled so it no longer shows in the GSI, but we keep another copy in the task tracking table
        # that we need to decrement the counter in the waiting list and possibly start waiting instances with the
        # same key
        self.tracking_table.update_task(
            item[handlers.TASK_TR_ID],
            task=item[handlers.TASK_TR_NAME],
            task_metrics=item.get(handlers.TASK_TR_METRICS, False),
            status=status,
            status_data={
                handlers.TASK_TR_CONCURRENCY_KEY: concurrency_key,
                handlers.TASK_TR_CONCURRENCY_ID: concurrency_key
            })

        if count > max_action_concurrency:
            self._logger.debug(DEBUG_WAITING, item[handlers.TASK_TR_ACTION], concurrency_key, count,
                               max_action_concurrency, item[handlers.TASK_TR_ID])
            return True

        return False

    def _start_task_execution(self, task_item, action=handlers.HANDLER_ACTION_EXECUTE):
        """
        Creates an instance of the lambda function that executes the task's action. It first checks if the action
        has specific memory requirements and based on this it creates a copy of this instance or one configured for
        the required memory. All information for executing the action is passed in the event.
        :param task_item: Task item for which the action is executed
        :return:
        """

        try:
            self._logger.debug("Entering start_task_execution ({}) with task {}",
                               action, safe_json(task_item, indent=3))

            # Create event for execution of the action and set its action so that it is picked up by the
            # execution handler
            event = {i: task_item.get(i) for i in task_item}
            event[handlers.HANDLER_EVENT_ACTION] = action

            self._logger.debug(DEBUG_ACTION,
                               task_item[handlers.TASK_TR_ACTION],
                               task_item[handlers.TASK_TR_NAME],
                               task_item[handlers.TASK_TR_ID])

            self._logger.debug(DEBUG_ACTION_PARAMETERS,
                               safe_json(task_item.get(handlers.TASK_TR_PARAMETERS, {}), indent=3))

            # get memory allocation for executing the task
            lambda_size = handlers.TASK_TR_COMPLETION_SIZE \
                if action == handlers.HANDLER_ACTION_TEST_COMPLETION \
                else handlers.TASK_TR_EXECUTE_SIZE

            execute_lambda_size = task_item.get(lambda_size, actions.ACTION_SIZE_STANDARD)

            if execute_lambda_size == actions.ACTION_USE_ECS:
                ecs_memory = task_item.get(
                    handlers.TASK_EXECUTE_ECS_MEMORY
                    if action == handlers.HANDLER_ACTION_EXECUTE
                    else handlers.TASK_COMPLETION_ECS_MEMORY, None)
            else:
                ecs_memory = None

            if not handlers.running_local(self._context):

                self._logger.debug(DEBUG_MEMORY_SIZE, execute_lambda_size)

                if execute_lambda_size != actions.ACTION_USE_ECS:

                    # create event payload
                    payload = str.encode(safe_json(event))

                    # determine which lambda to execute on
                    function_name = "{}-{}-{}".format(os.getenv(handlers.ENV_STACK_NAME),
                                                      os.getenv(handlers.ENV_LAMBDA_NAME),
                                                      execute_lambda_size)

                    self._logger.debug("Running execution of task on lambda function {}", function_name)

                    self._logger.debug(DEBUG_LAMBDA_FUNCTION_, function_name, payload)

                    # start lambda function
                    lambda_client = boto_retry.get_client_with_retries("lambda", ["invoke"],
                                                                       context=self._context,
                                                                       logger=self._logger)
                    resp = lambda_client.invoke_with_retries(FunctionName=function_name,
                                                             InvocationType="Event",
                                                             LogType="None",
                                                             Payload=payload)

                    task_info = {
                        "id": task_item[handlers.TASK_TR_ID],
                        "task": task_item[handlers.TASK_TR_NAME],
                        "action": task_item[handlers.TASK_TR_ACTION],
                        "payload": payload,
                        "status-code": resp["StatusCode"]
                    }

                    self._logger.debug(DEBUG_LAMBDA, safe_json(task_info, indent=2))
                    self.invoked_lambda_functions.append(task_info)
                else:
                    # run as ECS job
                    ecs_args = {
                        "subnets": os.getenv('AWSVPC_SUBNETS'),
                        "securitygroups": os.getenv('AWSVPC_SECURITYGROUPS'),
                        "assignpublicip": os.getenv('AWSVPC_ASSIGNPUBLICIP'),
                        handlers.HANDLER_EVENT_ACTION: action,
                        handlers.TASK_NAME: task_item[handlers.TASK_TR_NAME],
                        handlers.TASK_TR_ID: task_item[handlers.TASK_TR_ID]
                    }

                    self._logger.debug(DEBUG_RUNNING_ECS_TASK, action, task_item[handlers.TASK_TR_NAME])
                    handlers.run_as_ecs_job(ecs_args,
                                            ecs_memory_size=ecs_memory,
                                            context=self._context,
                                            logger=self._logger)

            else:
                lambda_handler(event, self._context)

            ResultNotifications(context=self._context, logger=self._logger).publish_started(task_item)

        except Exception as ex:
            self._logger.error(ERR_RUNNING_TASK, task_item, str(ex), full_stack())

    def _handle_new_task_item(self, task_item):
        """
        Handles stream updates for new tasks added to the task tracking table
        :param task_item:
        :return:
        """

        self._logger.debug("Handling new task logic")

        # tasks can be wait listed if there is a max concurrency level for their action
        if self._is_wait_listed(task_item):
            self.waiting_for_execution_tasks += 1
            return

        # if not wait listed start the action for the task
        self.started_tasks += 1
        self._start_task_execution(task_item)

    def _handle_completed_concurrency_item(self, task_item):

        self._logger.debug("Handling completed concurrency logic")

        # gets the concurrency key for the task
        concurrency_key = task_item[handlers.TASK_TR_CONCURRENCY_KEY]
        self._logger.debug("Handling completed task with ConcurrencyKey {}", concurrency_key)

        # use the concurrency key to decrement the counter for that key in the waiting list
        count = self._leave_waiting_list(task_item[handlers.TASK_TR_ID], concurrency_key)
        self._logger.debug("Concurrency count for ConcurrencyKey {} is {}", concurrency_key, count)

        self.finished_concurrency_tasks += 1

        ResultNotifications(context=self._context, logger=self._logger).publish_ended(task_item)

    def _handle_finished_task_without_completion(self, task_item):
        ResultNotifications(context=self._context, logger=self._logger).publish_ended(task_item)

    def _handle_start_waiting_action(self, concurrency_item):

        self._logger.debug("Handling start waiting task logic")

        # gets the concurrency key for the task
        concurrency_id = concurrency_item[handlers.TASK_TR_CONCURRENCY_ID]
        self._logger.debug("Handling completed task with ConcurrencyId {}", concurrency_id)

        waiting_list = self.tracking_table.get_waiting_tasks(concurrency_id)

        self._logger.debug(" List of waiting tasks for ConcurrencyKey {} is {}",
                           concurrency_id, safe_json(waiting_list, indent=3))

        if len(waiting_list) > 0:
            count = concurrency_item.get(ACTIVE_INSTANCES, 0)
            oldest_waiting_task = sorted(waiting_list, key=lambda w: w[handlers.TASK_TR_CREATED_TS])[0]

            self._logger.debug(DEBUG_START_WAITING, concurrency_id, count,
                               oldest_waiting_task[handlers.TASK_TR_ACTION],
                               oldest_waiting_task[handlers.TASK_TR_NAME],
                               oldest_waiting_task[handlers.TASK_TR_ID])

            self.started_waiting_tasks += 1
            self._start_task_execution(oldest_waiting_task)

    def _handle_check_completion(self, task_item):
        self._logger.debug("Handling test for completion logic")

        if task_item.get(handlers.TASK_TR_RUN_LOCAL, False) and not handlers.running_local(self._context):
            self._logger.debug("Item running in local mode skipped")
            return

        self.started_completion_checks += 1
        self._start_task_execution(task_item=task_item, action=handlers.HANDLER_ACTION_TEST_COMPLETION)

    def _handle_deleted_item(self, task_item):

        if task_item.get(handlers.TASK_TR_S3_RESOURCES, False):

            bucket = os.getenv(handlers.ENV_RESOURCE_BUCKET)
            key = task_item[handlers.TASK_TR_ID] + ".json"
            try:
                self._logger.debug(DEBUG_DELETING_RESOURCES_FROM_S3, bucket, key)
                self.s3_client.delete_object_with_retries(Bucket=bucket, Key=key)
            except Exception as ex:
                self._logger.warning(WARN_DELETING_RESOURCES, bucket, key, ex)

    def handle_request(self):
        """
        Handles the event triggered by updates to the actions tracking table.
        :return: results of handling selected updates
        """

        def tasks_items_to_execute():
            """
            Generator function that selects all record items from the event that need processing.
            :return:
            """

            def table_name(rec):
                source_arn = rec["eventSourceARN"]
                return source_arn.split("/")[1]

            def from_tracking_table(rec):
                return table_name(rec) == os.getenv(handlers.ENV_ACTION_TRACKING_TABLE)

            def from_concurrency_table(rec):
                return table_name(rec) == os.getenv(handlers.ENV_CONCURRENCY_TABLE)

            def get_old_image(task_record):
                return task_record["dynamodb"].get("OldImage", {})

            def get_new_image(task_record):
                return task_record["dynamodb"].get("NewImage", {})

            def get_new_status(task_record):
                return get_new_image(task_record).get(handlers.TASK_TR_STATUS, {}).get("S")

            def get_old_status(task_record):
                return get_old_image(task_record).get(handlers.TASK_TR_STATUS, {}).get("S")

            def is_task_tracking_table_update(task_record):
                if not from_tracking_table(task_record):
                    return False
                return task_record["eventName"] in ["UPDATE", "MODIFY"]

            def is_task_done(task_record):
                if not is_task_tracking_table_update(task_record):
                    return False

                new_status = get_new_status(task_record)
                old_status = get_old_status(task_record)
                # only updates that change the status into an inactive status count as done
                if old_status == new_status:
                    return False
                return new_status in handlers.task_tracking_table.NOT_LONGER_ACTIVE_STATUSES

            def is_task_with_concurrency(task_record):
                return get_new_image(task_record).get(handlers.TASK_TR_CONCURRENCY_KEY, {}).get("S") is not None

            def get_old_last_update(task_record):
                return get_old_image(task_record).get(handlers.TASK_TR_LAST_WAIT_COMPLETION, {}).get("S")

            def get_new_last_update(task_record):
                return get_new_image(task_record).get(handlers.TASK_TR_LAST_WAIT_COMPLETION, {}).get("S")

            def is_delete_task(task_record):
                return from_tracking_table(task_record) and task_record["eventName"] == "REMOVE"

            def is_new_task(task_record):
                if from_tracking_table(task_record) and task_record["eventName"] == "INSERT":
                    return get_new_status(task_record) == handlers.STATUS_PENDING
                return False

            def is_completed_with_concurrency(task_record):
                return is_task_done(task_record) and is_task_with_concurrency(task_record)

            def is_completed_without_concurrency(task_record):
                return is_task_done(task_record) and not is_task_with_concurrency(task_record)

            def is_wait_for_completion(task_record):
                if not is_task_tracking_table_update(task_record):
                    return False
                if get_old_status(task_record) != handlers.STATUS_WAIT_FOR_COMPLETION or \
                        get_new_status(task_record) != handlers.STATUS_WAIT_FOR_COMPLETION:
                    return False
                return get_old_last_update(task_record) != get_new_last_update(task_record)

            def is_concurrency_task_completed(concurrency_record):
                if not from_concurrency_table(concurrency_record):
                    return False
                if concurrency_record["eventName"] == "REMOVE":
                    return False
                return concurrency_record["dynamodb"].get("NewImage", {}).get("RunNext", {}).get("BOOL", False)

            def get_action_type(rec):

                if is_new_task(rec):
                    return NEW_TASK

                if is_completed_without_concurrency(rec):
                    return FINISHED_TASK

                if is_completed_with_concurrency(rec):
                    return FINISHED_CONCURRENCY_TASK

                if is_wait_for_completion(rec):
                    return CHECK_COMPLETION

                if is_delete_task(rec):
                    return DELETE_ITEM

                if is_concurrency_task_completed(rec):
                    return START_WAITING_ACTION

                return None

            for r in self._event.get("Records"):

                self._logger.debug("Record to process is {}", safe_json(r, indent=2))

                if r.get("eventSource") == "aws:dynamodb":

                    image_used = "NewImage" if "NewImage" in r["dynamodb"] else "OldImage"

                    if r["dynamodb"].get("NewImage", {}).get(handlers.TASK_TR_ACTION) is None and \
                            r["dynamodb"].get("OldImage", {}).get(handlers.TASK_TR_ACTION) is not None:
                        continue

                    self._logger.debug_enabled = r["dynamodb"][image_used].get(
                        handlers.TASK_TR_DEBUG, {}).get("BOOL", False)

                    update_to_handle = get_action_type(r)

                    if update_to_handle is not None:
                        yield update_to_handle, r
                    else:
                        self._logger.debug("No action for record")

        try:

            start = datetime.now()

            task_handlers = [
                self._handle_new_task_item,
                self._handle_finished_task_without_completion,
                self._handle_completed_concurrency_item,
                self._handle_check_completion,
                self._handle_deleted_item,
                self._handle_start_waiting_action
            ]

            for task_tracking_update_type, record in tasks_items_to_execute():
                self.done_work = True

                used_image = "OldImage" if record["eventName"] == "REMOVE" else "NewImage"
                image = record["dynamodb"][used_image]
                handled_item = unpack_record(image)

                self._logger.debug_enabled = handled_item.get(handlers.TASK_TR_DEBUG, False)

                self._logger.debug("Executing handler function {} for type {} ({})",
                                   task_handlers[task_tracking_update_type].__name__,
                                   self.task_string(task_tracking_update_type),
                                   task_tracking_update_type)
                task_handlers[task_tracking_update_type](handled_item)

            if not self.done_work:
                self._logger.clear()

            running_time = float((datetime.now() - start).total_seconds())

            if self.done_work:
                self._logger.debug(DEBUG_RESULT, running_time)

            return safe_dict({
                "datetime": datetime.now().isoformat(),
                "waiting-for-execution": self.waiting_for_execution_tasks,
                "started-check-for-completion": self.started_completion_checks,
                "started-execution": self.started_tasks,
                "started-waiting": self.started_waiting_tasks,
                "completed-concurrency-tasks": self.finished_concurrency_tasks,
                "running-time": running_time
            })

        finally:
            self._logger.flush()
class ScheduleHandler(object):
    """
    Class that handles time based events from CloudWatch rules
    """

    def __init__(self, event, context):
        """
        Initializes the instance.
        :param event: event to handle
        :param context: Lambda context
        """
        self._context = context
        self._event = event
        self._table = None

        # Setup logging
        classname = self.__class__.__name__
        dt = datetime.utcnow()
        logstream = LOG_STREAM.format(classname, dt.year, dt.month, dt.day)
        self._logger = QueuedLogger(logstream=logstream, buffersize=50, context=context)

        self.configuration_update = ScheduleHandler.is_config_update(self._event)
        if self.configuration_update:
            if "OldImage" in self._event["Records"][0]["dynamodb"]:
                self.updated_task = \
                    self._event["Records"][0]["dynamodb"]["OldImage"][configuration.CONFIG_TASK_NAME]["S"]
            else:
                self.updated_task = \
                    self._event["Records"][0]["dynamodb"]["NewImage"][configuration.CONFIG_TASK_NAME]["S"]

        self.execute_task_request = self.is_execute_event(self._event)
        self.executed_task_name = event.get(handlers.HANDLER_EVENT_TASK_NAME, "") \
            if self.execute_task_request else None

    @classmethod
    def is_handling_request(cls, event, _):
        """
        Tests if the event is handled by an instance of this handler.
        :param _:
        :param event: Tested event
        :return: True if the event is a cloudwatch rule event for scheduling or configuration update
        """

        source = event.get(handlers.HANDLER_EVENT_SOURCE, "")

        if source == "aws.events":
            resources = event.get("resources", [])
            if len(resources) == 1 and \
                    resources[0].partition("/")[2].lower() == os.getenv(handlers.ENV_OPS_AUTOMATOR_RULE).lower():
                return True
            return False

        return ScheduleHandler.is_config_update(event) or ScheduleHandler.is_execute_event(event)

    @staticmethod
    def is_config_update(event):
        if event.get("Records", [{}])[0].get("eventSource", "") != "aws:dynamodb":
            return False

        source_arn = event["Records"][0]["eventSourceARN"]
        table_name = source_arn.split("/")[1]
        return table_name == os.getenv(configuration.ENV_CONFIG_TABLE)

    @staticmethod
    def is_execute_event(event):
        return event.get(handlers.HANDLER_EVENT_ACTION, "") == handlers.HANDLER_EVENT_SCHEDULER_EXECUTE_TASK and \
               event.get(handlers.HANDLER_EVENT_TASK_NAME, None) is not None

    @property
    def _last_run_table(self):
        """
        Returns the table used to store the last execution time for this handler.
        :return: table to store last execution time for this handler
        """
        if self._table is None:
            self._table = boto3.resource('dynamodb').Table(os.environ[handlers.ENV_LAST_RUN_TABLE])
            add_retry_methods_to_resource(self._table, ["get_item", "update_item"])
        return self._table

    def _get_last_run(self):
        """
        Returns the last UTC datetime this ops automator handler was executed.
        :return: Last datetime this handler was executed in timezone UTC
        """
        # get from table
        resp = self._last_run_table.get_item_with_retries(Key={NAME_ATTR: LAST_SCHEDULER_RUN_KEY},
                                                          ConsistentRead=True)

        # test if item was in table
        if "Item" in resp:
            return dateutil.parser.parse(resp["Item"]["value"]).replace(second=0, microsecond=0)
        else:
            # default for first call is current datetime minus one minute
            return datetime.now(tz=pytz.timezone("UTC")).replace(second=0, microsecond=0) - timedelta(minutes=1)

    def _set_last_run(self):
        """
        Stores and returns the current datetime in UTC as the last execution time of this handler.
        :return: Stored last execution time in UTC timezone
        """
        dt = datetime.now(tz=pytz.timezone("UTC")).replace(second=0, microsecond=0)
        self._last_run_table.update_item(
            Key={NAME_ATTR: LAST_SCHEDULER_RUN_KEY},
            AttributeUpdates={
                "value": {
                    "Action": "PUT",
                    "Value": dt.isoformat()
                }
            })
        return dt

    def handle_request(self):
        """
        Handles the cloudwatch rule timer event
        :return: Information on started tasks, if any
        """

        start = datetime.now()

        try:
            task_config = TaskConfiguration(context=self._context, logger=self._logger)
            if not self.execute_task_request:
                result = self.handle_scheduler_tasks(task_config)
            else:
                result = self.handle_execute_task_request(task_config)

            running_time = float((datetime.now() - start).total_seconds())

            self._logger.info(INFO_RESULT, running_time)

            return result
        finally:
            self._logger.flush()

    def handle_scheduler_tasks(self, task_config):

        started_tasks = {}
        start = datetime.now()

        last_run_dt = self._get_last_run()

        self._logger.info(INFO_LAST_SAVED, last_run_dt.isoformat())

        if self.configuration_update:
            self._logger.info(INFO_CONFIG_RUN, self.updated_task)

        current_dt = self._set_last_run()
        already_ran_this_minute = last_run_dt == current_dt
        if already_ran_this_minute and not (self.configuration_update or self.execute_task_request):
            self._logger.info(INFO_TASK_SCHEDULER_ALREADY_RAN)
        else:

            self._logger.info(INFO_CURRENT_SCHEDULING_DT, current_dt)

            task = None
            enabled_tasks = 0

            next_executed_task = None
            utc = pytz.timezone("UTC")

            tasks = [t for t in task_config.get_tasks()
                     if t.get(handlers.TASK_INTERVAL) is not None and t.get(handlers.TASK_ENABLED, True)]

            try:
                for task in tasks:

                    enabled_tasks += 1

                    self._logger.debug_enabled = task[handlers.TASK_DEBUG]

                    task_name = task[handlers.TASK_NAME]

                    # timezone for specific task
                    task_timezone = pytz.timezone(task[handlers.TASK_TIMEZONE])

                    # create cron expression to test if task needs to be executed
                    task_cron_expression = CronExpression(expression=task[handlers.TASK_INTERVAL])

                    localized_last_run = last_run_dt.astimezone(task_timezone)
                    localized_current_dt = current_dt.astimezone(task_timezone)

                    next_execution = task_cron_expression.first_within_next(timedelta(hours=24),
                                                                            localized_current_dt)
                    next_execution_utc = next_execution.astimezone(utc).replace(microsecond=0) \
                        if next_execution is not None else None

                    if next_execution_utc is not None:
                        if next_executed_task is None or next_execution_utc < next_executed_task[0]:
                            next_executed_task = (next_execution_utc, task)

                    if already_ran_this_minute:
                        continue

                    # test if task needs to be executed since last run of ops automator
                    execute_dt_since_last = task_cron_expression.last_since(localized_last_run,
                                                                            localized_current_dt)

                    if execute_dt_since_last is None:
                        if next_execution is not None:
                            next_execution = next_execution.astimezone(task_timezone)
                            self._logger.info(INFO_NEXT_EXECUTION, task_name,
                                              next_execution.isoformat(), task_timezone)
                        else:
                            self._logger.info(INFO_NO_NEXT_WITHIN, task_name)
                        continue

                    self._logger.info(INFO_SCHEDULED_TASK, task_name, execute_dt_since_last, task_timezone,
                                      str(safe_json(task, indent=2)))

                    # create an event for lambda function that starts execution by selecting resources for this task
                    task_group, sub_tasks = self._execute_task(task, execute_dt_since_last)

                    started_tasks[task_name] = {
                        "task-group": task_group,
                        "sub-tasks": sub_tasks
                    }

                if started_tasks:
                    self._logger.info(INFO_STARTED_TASKS, enabled_tasks, ",".join(started_tasks))
                else:
                    self._logger.info(INFO_NO_TASKS_STARTED, enabled_tasks)

                self._set_next_schedule_event(current_dt, next_executed_task)

                running_time = float((datetime.now() - start).total_seconds())

                return safe_dict({
                    "datetime": datetime.now().isoformat(),
                    "running-time": running_time,
                    "event-datetime": current_dt.isoformat(),
                    "enabled_tasks": enabled_tasks,
                    "started-tasks": started_tasks
                })

            except ValueError as ex:
                self._logger.error(ERR_SCHEDULE_HANDLER, ex, safe_json(task, indent=2))

    def handle_execute_task_request(self, task_config):

        self._logger.info(INF_HANDLING_EXEC_REQUEST, self.executed_task_name)

        task_to_execute = task_config.get_task(name=self.executed_task_name)
        if task_to_execute is None:
            raise ValueError("Task with name {} does not exist for stack {}".format(
                self.executed_task_name, os.getenv(handlers.ENV_STACK_NAME)))

        if not task_to_execute.get(handlers.TASK_ENABLED):
            raise ValueError("Task with name {} is not enabled".format(self.executed_task_name))

        task_group, sub_tasks = self._execute_task(task_to_execute)
        return safe_dict({
            "datetime": datetime.now().isoformat(),
            "executed-task": self.executed_task_name,
            "task-group": task_group,
            "sub-tasks": sub_tasks
        })

    def _set_next_schedule_event(self, scheduler_dt, next_executed_task):
        """
        Sets the cron expression of the scheduler event rule in cloudwatch depending on the next executed task
        :param scheduler_dt: dt used for this scheduler run
        :param next_executed_task: Next task to execute
        :return:
        """
        if next_executed_task is not None:

            utc = pytz.timezone("UTC")

            time_str = "{} ({})".format(next_executed_task[0].isoformat(), utc)

            next_task_tz = pytz.timezone(next_executed_task[1][handlers.TASK_TIMEZONE])
            if next_task_tz != utc:
                time_str += ", {} ({})".format(next_executed_task[0].astimezone(next_task_tz), next_task_tz)
            self._logger.info(INFO_NEXT_EXECUTED_TASK, next_executed_task[1][handlers.TASK_NAME], time_str)

            if next_executed_task[0] > scheduler_dt + timedelta(minutes=5):
                next_event_time = handlers.set_event_for_time(next_executed_task[0],
                                                              task=next_executed_task[1],
                                                              logger=self._logger,
                                                              context=self._context)
                self._logger.info(INF_NEXT_EVENT, next_event_time.isoformat())
            else:
                handlers.set_scheduler_rule_every_minute(task=next_executed_task[1])
                self._logger.info(INFO_NEXT_ONE_MINUTE)
        else:
            self._logger.info(INFO_NO_TASKS_SCHEDULED)
            next_event_time = handlers.set_event_for_time(scheduler_dt, context=self._context, logger=self._logger)
            self._logger.info(INF_NEXT_EVENT, next_event_time.isoformat())

    @staticmethod
    def task_account_region_sub_tasks(task):

        action_properties = actions.get_action_properties(task[handlers.TASK_ACTION])

        aggregation_level = action_properties[actions.ACTION_AGGREGATION]

        # property may be a lambda function, call the function with parameters of task as lambda parameters
        if types.FunctionType == type(aggregation_level):
            aggregation_level = aggregation_level(task.get("parameters", {}))

        if aggregation_level == actions.ACTION_AGGREGATION_TASK:
            yield {
                handlers.TASK_THIS_ACCOUNT: task[handlers.TASK_THIS_ACCOUNT],
                handlers.TASK_ACCOUNTS: task[handlers.TASK_ACCOUNTS],
                handlers.TASK_REGIONS: task[handlers.TASK_REGIONS]
            }
        else:
            if task[handlers.TASK_THIS_ACCOUNT]:
                if aggregation_level == actions.ACTION_AGGREGATION_ACCOUNT:
                    yield {
                        handlers.TASK_THIS_ACCOUNT: True,
                        handlers.TASK_ACCOUNTS: [],
                        handlers.TASK_REGIONS: task[handlers.TASK_REGIONS]
                    }
                else:
                    for region in task.get(handlers.TASK_REGIONS, [None]):
                        yield {
                            handlers.TASK_THIS_ACCOUNT: True,
                            handlers.TASK_ACCOUNTS: [],
                            handlers.TASK_REGIONS: [region]
                        }

            for account in task.get(handlers.TASK_ACCOUNTS, []):
                if aggregation_level == actions.ACTION_AGGREGATION_ACCOUNT:
                    yield {
                        handlers.TASK_THIS_ACCOUNT: False,
                        handlers.TASK_ACCOUNTS: [account],
                        handlers.TASK_REGIONS: task[handlers.TASK_REGIONS]
                    }
                else:
                    for region in task.get(handlers.TASK_REGIONS, [None]):
                        yield {
                            handlers.TASK_THIS_ACCOUNT: False,
                            handlers.TASK_ACCOUNTS: [account],
                            handlers.TASK_REGIONS: [region]
                        }

    def _execute_task(self, task, dt=None, task_group=None):
        """
        Executes a task by starting a lambda function that selects the resources for that action
        :param task: Task started
        :param dt: Task start datetime
        :return:
        """

        debug_state = self._logger.debug_enabled
        self._logger.debug_enabled = task.get(handlers.TASK_DEBUG, False)
        if task_group is None:
            task_group = str(uuid.uuid4())
        try:

            event = {
                handlers.HANDLER_EVENT_ACTION: handlers.HANDLER_ACTION_SELECT_RESOURCES,
                handlers.HANDLER_EVENT_TASK: task,
                handlers.HANDLER_EVENT_SOURCE: "scheduler-handler",
                handlers.HANDLER_EVENT_TASK_DT: dt.isoformat() if dt is not None else datetime.utcnow().isoformat(),
                handlers.HANDLER_EVENT_TASK_GROUP: task_group
            }

            sub_tasks = list(ScheduleHandler.task_account_region_sub_tasks(task))
            for sub_task in sub_tasks:

                event[handlers.HANDLER_EVENT_SUB_TASK] = sub_task

                if not handlers.running_local(self._context):

                    if task[handlers.TASK_SELECT_SIZE] != actions.ACTION_USE_ECS:

                        # start lambda function to scan for task resources
                        payload = str.encode(safe_json(event))
                        client = get_client_with_retries("lambda", ["invoke"], context=self._context)
                        function_name = "{}-{}-{}".format(os.getenv(handlers.ENV_STACK_NAME),
                                                          os.getenv(handlers.ENV_LAMBDA_NAME),
                                                          task[handlers.TASK_SELECT_SIZE])
                        self._logger.info(INFO_RUNNING_LAMBDA, function_name)

                        try:
                            resp = client.invoke_with_retries(FunctionName=function_name,
                                                              InvocationType="Event",
                                                              LogType="None",
                                                              Payload=payload)
                            self._logger.debug(DEBUG_LAMBDA, resp["StatusCode"], payload)
                        except Exception as ex:
                            self._logger.error(ERR_FAILED_START_LAMBDA_TASK, str(ex))
                    else:
                        ecs_args = {
                            handlers.HANDLER_EVENT_ACTION: handlers.HANDLER_ACTION_SELECT_RESOURCES,
                            handlers.TASK_NAME: task[handlers.TASK_NAME],
                            handlers.HANDLER_EVENT_SUB_TASK: sub_task
                        }
                        ecs_memory = task.get(handlers.TASK_SELECT_ECS_MEMORY, None)
                        self._logger.info(INFO_RUNNING_AS_ECS_JOB, task[handlers.TASK_NAME])
                        handlers.run_as_ecs_job(ecs_args,
                                                ecs_memory_size=ecs_memory,
                                                context=self._context,
                                                logger=self._logger)
                else:
                    if task[handlers.TASK_SELECT_SIZE] == actions.ACTION_USE_ECS:
                        ecs_args = {
                            handlers.HANDLER_EVENT_ACTION: handlers.HANDLER_ACTION_SELECT_RESOURCES,
                            handlers.TASK_NAME: task[handlers.TASK_NAME],
                            handlers.HANDLER_EVENT_SUB_TASK: sub_task
                        }
                        ecs_memory = task.get(handlers.TASK_SELECT_ECS_MEMORY, None)
                        handlers.run_as_ecs_job(ecs_args, ecs_memory_size=ecs_memory, logger=self._logger)
                    else:
                        # if not running in lambda environment pass event to main task handler
                        lambda_handler(event, self._context)

            return task_group, sub_tasks
        finally:
            self._logger.debug_enabled = debug_state
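
# Editorial sketch, not part of the original solution: how handle_scheduler_tasks() uses
# CronExpression to decide whether a task interval fired between two scheduler runs.
# The interval expression and datetimes are illustrative placeholders; CronExpression,
# last_since() and first_within_next() are the same class and methods used by the
# handler above.
def _example_interval_check():
    cron = CronExpression(expression="*/15 * * * *")             # task interval: every 15 minutes
    last_run = datetime(2019, 6, 1, 12, 1)                       # previous scheduler run
    now = datetime(2019, 6, 1, 12, 20)                           # current scheduler run
    fired = cron.last_since(last_run, now)                       # e.g. 12:15 -> task is executed
    upcoming = cron.first_within_next(timedelta(hours=24), now)  # used to set the next rule event
    return fired, upcoming
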
class SelectResourcesHandler(object): """ Class that handles the selection of AWS service resources for a task to perform its action on. """ def __init__(self, event, context, logger=None, tracking_store=None): def log_stream_name(): classname = self.__class__.__name__ dt = datetime.utcnow() account = self._event.get(handlers.HANDLER_SELECT_ARGUMENTS, {}).get(handlers.HANDLER_EVENT_ACCOUNT, "") regions = self._event.get(handlers.HANDLER_SELECT_ARGUMENTS, {}).get(handlers.HANDLER_EVENT_REGIONS, []) if account is not None and len(regions) > 0: account_and_region = "-".join([account, regions[0]]) + "-" else: region = "" if self.sub_task is not None: account = "" if self._this_account: if len(self._accounts) == 0: account = os.getenv( handlers.ENV_OPS_AUTOMATOR_ACCOUNT) elif len(self._accounts) == 1: account = self._accounts[0] region = self._regions[0] if len( self._regions) == 1 else "" if account != "": if region not in ["", None]: account_and_region = "-".join([account, region]) + "-" else: account_and_region = account else: account_and_region = "" return LOG_STREAM.format(classname, self.task[handlers.TASK_NAME], account_and_region, dt.year, dt.month, dt.day) self._context = context self._event = event self.task = event[handlers.HANDLER_EVENT_TASK] self.sub_task = event.get(handlers.HANDLER_EVENT_SUB_TASK, None) self.use_custom_select = event.get( handlers.HANDLER_EVENT_CUSTOM_SELECT, True) # the job id is used to correlate all generated tasks for the selected resources self.task_group = self._event.get(handlers.HANDLER_EVENT_TASK_GROUP, None) if self.task_group is None: self.task_group = str(uuid.uuid4()) debug = event[handlers.HANDLER_EVENT_TASK].get(handlers.TASK_DEBUG, False) if logger is None: self._logger = QueuedLogger(logstream=log_stream_name(), context=context, buffersize=50 if debug else 20, debug=debug) else: self._logger = logger self._sts = None self.select_args = event.get(handlers.HANDLER_SELECT_ARGUMENTS, {}) self.task_dt = event[handlers.HANDLER_EVENT_TASK_DT] self.action_properties = actions.get_action_properties( self.task[handlers.TASK_ACTION]) self.action_class = actions.get_action_class( self.task[handlers.TASK_ACTION]) self.task_parameters = self.task.get(handlers.TASK_PARAMETERS, {}) self.metrics = self.task.get(handlers.TASK_METRICS, False) self.service = self.action_properties[actions.ACTION_SERVICE] self.keep_tags = self.action_properties.get( actions.ACTION_KEEP_RESOURCE_TAGS, True) self.source = self._event.get(handlers.HANDLER_EVENT_SOURCE, handlers.UNKNOWN_SOURCE) self.run_local = handlers.running_local(self._context) self._timer = None self._timeout_event = self._timeout_event = threading.Event() self.aggregation_level = self.action_properties.get( actions.ACTION_AGGREGATION, actions.ACTION_AGGREGATION_RESOURCE) if self.aggregation_level is not None and isinstance( self.aggregation_level, types.FunctionType): self.aggregation_level = self.aggregation_level( self.task_parameters) self.batch_size = self.action_properties.get(actions.ACTION_BATCH_SIZE) if self.batch_size is not None and isinstance(self.batch_size, types.FunctionType): self.batch_size = self.batch_size(self.task_parameters) self.actions_tracking = TaskTrackingTable( self._context, logger=self._logger) if tracking_store is None else tracking_store @classmethod def is_handling_request(cls, event, _): """ Tests if this handler handles the event. 
:param _: :param event: The event tyo test :return: True if the event is handled by this handler """ return event.get(handlers.HANDLER_EVENT_ACTION, "") == handlers.HANDLER_ACTION_SELECT_RESOURCES @property def _task_tag(self): """ Returns the name of the tag that contains the list of actions for a resource. :return: The name of the tag that contains the list of actions for a resource """ name = os.environ.get(handlers.ENV_AUTOMATOR_TAG_NAME) if name is None: name = handlers.DEFAULT_SCHEDULER_TAG return name @property def sts(self): if self._sts is None: self._sts = boto3.client("sts") return self._sts @property def _resource_name(self): name = self.action_properties[actions.ACTION_RESOURCES] if name in [None, ""]: name = self._event.get(handlers.HANDLER_SELECT_ARGUMENTS, {}).get( handlers.HANDLER_EVENT_RESOURCE_NAME, "") return name def _check_can_execute(self, selected_resources): """ Checks if the action for the task can be executed with the selected resources :param selected_resources: :return: """ check_method = getattr(self.action_class, actions.CHECK_CAN_EXECUTE, None) if check_method: try: check_method(selected_resources, self.task_parameters) return True except ValueError as ex: self._logger.error(ERR_CAN_NOT_EXECUTE_WITH_THESE_RESOURCES, self.task[handlers.TASK_ACTION], self.task[handlers.TASK_NAME], str(ex)) return False return True def _task_assumed_roles(self): """ Returns a list of service instances for each handled account/role :return: """ # account can optionally be passed in by events account = self._event.get(handlers.HANDLER_SELECT_ARGUMENTS, {}).get(handlers.HANDLER_EVENT_ACCOUNT) if account is not None: assumed_role = handlers.get_account_role(account=account, task=self.task, logger=self._logger) if assumed_role is None: if account != os.getenv(handlers.ENV_OPS_AUTOMATOR_ACCOUNT): self._logger.error(ERR_ACCOUNT_SKIPPED_NO_ROLE, account) yield None else: yield assumed_role else: # no role if processing scheduled task in own account if self._this_account: assumed_role = handlers.get_account_role(account=os.getenv( handlers.ENV_OPS_AUTOMATOR_ACCOUNT), task=self.task, logger=self._logger) yield assumed_role for acct in self._accounts: # for external accounts assumed_role = handlers.get_account_role(account=acct, task=self.task, logger=self._logger) if assumed_role is not None: yield assumed_role @property def _this_account(self): if self.sub_task is not None: return self.sub_task[handlers.TASK_THIS_ACCOUNT] return self.task.get(handlers.TASK_THIS_ACCOUNT, True) @property def _accounts(self): if self.sub_task is not None: return self.sub_task[handlers.TASK_ACCOUNTS] return self.task.get(handlers.TASK_ACCOUNTS, []) @property def _regions(self): """ Returns the regions in where resources are selected :return: """ regions = self._event.get(handlers.HANDLER_SELECT_ARGUMENTS, {}).get(handlers.HANDLER_EVENT_REGIONS) if regions is None: regions = self.sub_task[ handlers. TASK_REGIONS] if self.sub_task is not None else self.task.get( handlers.TASK_REGIONS, [None]) else: # check if the regions in the event are in the task configurations regions checked_regions = [r for r in regions if r in regions] if len(checked_regions) != len(regions): self._logger.warning(WARN_REGION_NOT_IN_TASK_CONFIGURATION, self._event) return checked_regions return regions if len(regions) > 0 else [None] def handle_request(self): """ Handles the select resources request. 
Creates new actions for resources found for a task :return: Results of handling the request """ def filter_by_action_filter(srv, used_role, r): filter_method = getattr(self.action_class, actions.SELECT_AND_PROCESS_RESOURCE_METHOD, None) if filter_method is not None: r = filter_method(srv, self._logger, self._resource_name, r, self._context, self.task, used_role) if r is None: self._logger.debug( DEBUG_FILTER_METHOD, self.action_class.__name__, actions.SELECT_AND_PROCESS_RESOURCE_METHOD) return None else: self._logger.debug( DEBUG_FILTERED_RESOURCE, self.action_class.__name__, actions.SELECT_AND_PROCESS_RESOURCE_METHOD, safe_json(r, indent=3)) return r def is_selected_resource(aws_service, resource, used_role, taskname, tags_filter, does_resource_supports_tags): # No tags then just use filter method if any if not does_resource_supports_tags: self._logger.debug(DEBUG_RESOURCE_NO_TAGS, resource) return filter_by_action_filter(srv=aws_service, used_role=used_role, r=resource) tags = resource.get("Tags", {}) # name of the tag that holds the list of tasks for this resource tagname = self._task_tag if tags_filter is None: # test if name of the task is in list of tasks in tag value if (tagname not in tags) or (taskname not in tagging.split_task_list( tags[tagname])): self._logger.debug( DEBUG_RESOURCE_NOT_SELECTED, safe_json(resource, indent=2), taskname, ','.join( ["'{}'='{}'".format(t, tags[t]) for t in tags])) return None self._logger.debug(DEBUG_SELECTED_BY_TASK_NAME_IN_TAG_VALUE, safe_json(resource, indent=2), tagname, taskname) else: # using a tag filter, * means any tag if tags_filter != tagging.tag_filter_set.WILDCARD_CHAR: # test if there are any tags matching the tag filter if not TagFilterExpression(tags_filter).is_match(tags): self._logger.debug( DEBUG_RESOURCE_NOT_SELECTED_TAG_FILTER, safe_json(resource, indent=2), taskname, ','.join([ "'{}'='{}'".format(t, tags[t]) for t in tags ])) return None self._logger.debug(DEBUG_SELECTED_BY_TAG_FILTER, safe_json(resource, indent=2), tags, tag_filter_str, taskname) else: self._logger.debug(DEBUG_SELECTED_WILDCARD_TAG_FILTER, safe_json(resource, indent=2), taskname) return filter_by_action_filter(srv=aws_service, used_role=used_role, r=resource) return filter_by_action_filter(srv=aws_service, used_role=used_role, r=resource) def resource_batches(resources): """ Returns resources as chunks of size items. 
If the class has an optional custom aggregation function then the resources are aggregated first using this function before applying the batch size :param resources: resources to process :return: Generator for blocks of resource items """ aggregate_func = getattr(self.action_class, actions.CUSTOM_AGGREGATE_METHOD, None) for i in aggregate_func( resources, self.task_parameters, self._logger) if aggregate_func is not None else [ resources ]: if self.batch_size is None: yield i else: first = 0 while first < len(i): yield i[first:first + self.batch_size] first += self.batch_size def setup_tag_filtering(t_name): # get optional tag filter no_select_by_tags = self.action_properties.get( actions.ACTION_NO_TAG_SELECT, False) if no_select_by_tags: tag_filter_string = tagging.tag_filter_set.WILDCARD_CHAR else: tag_filter_string = self.task.get(handlers.TASK_TAG_FILTER) # set if only a single task is required for selecting the resources, it is used to optimise the select select_tag = None if tag_filter_string is None: self._logger.debug(DEBUG_SELECT_BY_TASK_NAME, self._resource_name, self._task_tag, t_name) select_tag = self._task_tag elif tag_filter_string == tagging.tag_filter_set.WILDCARD_CHAR: self._logger.debug(DEBUG_SELECT_ALL_RESOURCES, self._resource_name) else: self._logger.debug(DEBUG_TAG_FILTER_USED_TO_SELECT_RESOURCES, self._resource_name) # build the tag expression that us used to filter the resources tag_filter_expression = TagFilterExpression(tag_filter_string) # the keys of the used tags tag_filter_expression_tag_keys = list( tag_filter_expression.get_filter_keys()) # if there is only a single tag then we can optimize by just filtering on that specific tag if len(tag_filter_expression_tag_keys) == 1 and \ tagging.tag_filter_set.WILDCARD_CHAR not in tag_filter_expression_tag_keys[0]: select_tag = tag_filter_expression_tag_keys[0] return select_tag, tag_filter_string def add_aggregated(aggregated_resources): # create tasks action for aggregated resources , optionally split in batch size chunks for ra in resource_batches(aggregated_resources): if self._check_can_execute(ra): action_item = self.actions_tracking.add_task_action( task=self.task, assumed_role=assumed_role, action_resources=ra, task_datetime=self.task_dt, source=self.source, task_group=self.task_group) self._logger.debug(DEBUG_ADDED_AGGREGATED_RESOURCES_TASK, action_item[handlers.TASK_TR_ID], len(ra), self._resource_name, self.task[handlers.TASK_NAME]) self._logger.debug("Added item\n{}", safe_json(action_item, indent=3)) yield action_item def add_as_individual(resources): for ri in resources: # task action for each selected resource if self._check_can_execute([ri]): action_item = self.actions_tracking.add_task_action( task=self.task, assumed_role=assumed_role, action_resources=ri, task_datetime=self.task_dt, source=self.source, task_group=self.task_group) self._logger.debug(DEBUG_ADD_SINGLE_RESOURCE_TASK, action_item[handlers.TASK_TR_ID], self._resource_name, self.task[handlers.TASK_NAME]) self._logger.debug("Added item\n{}", safe_json(action_item, indent=3)) yield action_item try: task_items = [] start = datetime.now() self._logger.debug(DEBUG_EVENT, safe_json(self._event, indent=3)) self._logger.debug(DEBUG_ACTION, safe_json(self.action_properties, indent=3)) self._logger.info(INFO_SELECTED_RESOURCES, self._resource_name, self.service, self.task[handlers.TASK_NAME]) self._logger.info(INFO_AGGR_LEVEL, self.aggregation_level) task_level_aggregated_resources = [] args = self._build_describe_argument() 
            service_resource_with_tags = services.create_service(self.service).resources_with_tags
            if self._resource_name == "":
                supports_tags = len(service_resource_with_tags) != 0
            else:
                supports_tags = self._resource_name.lower() in [r.lower() for r in service_resource_with_tags]

            args["tags"] = supports_tags
            self._logger.info(INFO_USE_TAGS_TO_SELECT, "R" if supports_tags else "No r")

            task_name = self.task[handlers.TASK_NAME]
            count_resource_items = 0
            selected_resource_items = 0

            select_on_tag, tag_filter_str = setup_tag_filtering(task_name)

            filter_func = getattr(self.action_class, actions.FILTER_RESOURCE_METHOD, None)

            # timer to guard selection time and log a warning if getting close to the lambda timeout
            if self._context is not None:
                self.start_timer(REMAINING_TIME_AFTER_DESCRIBE)

            try:
                for assumed_role in self._task_assumed_roles():
                    retry_strategy = get_default_retry_strategy(service=self.service, context=self._context)

                    service = services.create_service(service_name=self.service,
                                                      service_retry_strategy=retry_strategy,
                                                      role_arn=assumed_role)

                    if self.is_timed_out():
                        break

                    # contains resources for account
                    account_level_aggregated_resources = []

                    self._logger.info(INFO_ACCOUNT, service.aws_account)
                    if assumed_role not in [None, ""]:
                        self._logger.info(INFO_ASSUMED_ROLE, assumed_role)

                    for region in self._regions:

                        # test for timeouts
                        if self.is_timed_out():
                            break

                        # handle region passed in the event
                        if region is not None:
                            args["region"] = region
                        else:
                            if "region" in args:
                                del args["region"]

                        # resources can be passed in the event by event handlers
                        all_resources = self._event.get(handlers.HANDLER_SELECT_RESOURCES, None)
                        if all_resources is None:

                            # actions can have an optional method to select resources
                            action_custom_describe_function = getattr(self.action_class, "describe_resources", None)
                            if action_custom_describe_function is not None and self.use_custom_select:
                                all_resources = action_custom_describe_function(service, self.task, region)
                            else:
                                # select resources from the service
                                self._logger.debug(DEBUG_SELECT_PARAMETERS, self._resource_name, self.service, args)
                                # selecting a list of all resources in this account/region
                                all_resources = list(service.describe(self._resource_name,
                                                                      filter_func=filter_func,
                                                                      select_on_tag=select_on_tag,
                                                                      **args))
                            # test for timeout
                            if self.is_timed_out():
                                break

                        count_resource_items += len(all_resources)

                        self._logger.info(INFO_RESOURCES_FOUND, len(all_resources))

                        # select resources that are processed by the task
                        selected_resources = []
                        for sr in all_resources:
                            sel = is_selected_resource(aws_service=service,
                                                       resource=sr,
                                                       used_role=assumed_role,
                                                       taskname=task_name,
                                                       tags_filter=tag_filter_str,
                                                       does_resource_support_tags=supports_tags)
                            if sel is not None:
                                selected_resources.append(sel)

                        selected_resource_items += len(selected_resources)

                        # display found and selected resources
                        if len(all_resources) > 0:
                            self._logger.info(INFO_RESOURCES_SELECTED, len(selected_resources))
                            if len(selected_resources) == 0:
                                continue

                        # delete tags if not needed by the action
                        if not self.keep_tags:
                            for res in selected_resources:
                                if "Tags" in res:
                                    del res["Tags"]

                        # add resources to the total list of resources for this task
                        if self.aggregation_level == actions.ACTION_AGGREGATION_TASK:
                            task_level_aggregated_resources += selected_resources

                        # add resources to the list of resources for this account
                        if self.aggregation_level == actions.ACTION_AGGREGATION_ACCOUNT:
                            account_level_aggregated_resources += selected_resources
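                        # Aggregation levels, summarized from the branches above and below:
                        #   ACTION_AGGREGATION_TASK     -> one action for all selected resources of the task
                        #   ACTION_AGGREGATION_ACCOUNT  -> one action per account
                        #   ACTION_AGGREGATION_REGION   -> one action per account/region
                        #   ACTION_AGGREGATION_RESOURCE -> one action per individual resource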
                        # add batch(es) of resources for this region
                        if self.aggregation_level == actions.ACTION_AGGREGATION_REGION and \
                                len(selected_resources) > 0:
                            task_items += list(add_aggregated(selected_resources))

                        # no aggregation, add each individual resource
                        if self.aggregation_level == actions.ACTION_AGGREGATION_RESOURCE and \
                                len(selected_resources) > 0:
                            task_items += list(add_as_individual(selected_resources))

                    # at the end of the region loop, check if aggregated resources for the account need to be added
                    if self.aggregation_level == actions.ACTION_AGGREGATION_ACCOUNT and \
                            len(account_level_aggregated_resources) > 0:
                        task_items += list(add_aggregated(account_level_aggregated_resources))

                # at the end of the accounts loop, check if aggregated resources for the task need to be added
                if self.aggregation_level == actions.ACTION_AGGREGATION_TASK and \
                        len(task_level_aggregated_resources) > 0:
                    task_items += list(add_aggregated(task_level_aggregated_resources))

            except Exception as ex:
                raise_exception(ERR_SELECTING_TASK_RESOURCES, self.task[handlers.TASK_NAME], ex)

            finally:
                if self._timer is not None:
                    # cancel the timer used to guard against timeouts when selecting resources
                    self._timer.cancel()
                    if self.is_timed_out():
                        raise_exception(ERR_TIMEOUT_SELECTING_RESOURCES, self._resource_name, self.service, task_name)

                    self.start_timer(REMAINING_TIME_AFTER_STORE)

                    self.actions_tracking.flush(self._timeout_event)
                    if self.is_timed_out():
                        raise_exception(ERR_CREATING_TASKS_FOR_SELECTED_RESOURCES, task_name)
                    self._timer.cancel()
                else:
                    self.actions_tracking.flush()

            self._logger.info(INFO_ADDED_ITEMS, len(task_items), self.task[handlers.TASK_NAME])

            running_time = float((datetime.now() - start).total_seconds())
            self._logger.info(INFO_RESULT, running_time)

            if self.metrics:
                put_task_select_data(task_name=task_name,
                                     items=count_resource_items,
                                     selected_items=selected_resource_items,
                                     logger=self._logger,
                                     selection_time=running_time)

            return safe_dict({
                "datetime": datetime.now().isoformat(),
                "running-time": running_time,
                "dispatched-tasks": task_items
            })

        finally:
            self._logger.flush()

    def select_timed_out(self):
        """
        Function is called when the handling of the request times out
        :return:
        """
        time_used = int(os.getenv(handlers.ENV_LAMBDA_TIMEOUT, 900)) - \
            int(self._context.get_remaining_time_in_millis() / 1000)
        self._logger.error(ERR_TIMEOUT_SELECT_OR_STORE, time_used, self.task[handlers.TASK_NAME])
        self._timeout_event.set()
        self._logger.flush()
        self._timer.cancel()

    def start_timer(self, remaining):
        execution_time_left = (self._context.get_remaining_time_in_millis() / 1000.00) - remaining
        self._timer = threading.Timer(execution_time_left, self.select_timed_out)
        self._timer.start()

    def is_timed_out(self):
        return self._timeout_event is not None and self._timeout_event.is_set()

    def _build_describe_argument(self):
        """
        Builds the arguments for the describe call that selects the resources
        :return: arguments for the describe call
        """
        args = {}
        # get the mapping for parameters that should be used as parameters to the describe method call
        # to select the resources
        action_parameters = self.action_properties.get(actions.ACTION_PARAMETERS, {})
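        # For illustration (hypothetical parameter metadata): an action parameter declared as
        #   {"Volumes": {actions.PARAM_DESCRIBE_PARAMETER: "VolumeIds"}}
        # combined with task_parameters == {"Volumes": ["vol-1"]} adds {"VolumeIds": ["vol-1"]} to args.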
        for p in [p for p in action_parameters
                  if action_parameters[p].get(actions.PARAM_DESCRIBE_PARAMETER) is not None]:
            if self.task_parameters.get(p) is not None:
                args[action_parameters[p][actions.PARAM_DESCRIBE_PARAMETER]] = self.task_parameters[p]

        # also add describe method parameters specified as select parameters in the metadata of the action
        select_parameters = self.action_properties.get(actions.ACTION_SELECT_PARAMETERS, {})
        if isinstance(select_parameters, types.FunctionType):
            select_parameters = select_parameters(self.task, self.task_parameters)
        for p in select_parameters:
            args[p] = select_parameters[p]

        # region and account are separate describe parameters
        args.update({a: self.select_args[a] for a in self.select_args
                     if a not in [handlers.HANDLER_EVENT_REGIONS,
                                  handlers.HANDLER_EVENT_ACCOUNT,
                                  handlers.HANDLER_EVENT_RESOURCE_NAME]})

        # action specified select jmespath expression for resources
        if actions.ACTION_SELECT_EXPRESSION in self.action_properties:
            # Replace parameter placeholders with values. We can't use str.format here as the jmespath
            # expression may contain {} as well for projection of attributes, so placeholders in the
            # format %paramname% are used for parameter names.
            jmes = self.action_properties[actions.ACTION_SELECT_EXPRESSION]
            for p in self.task_parameters:
                jmes = jmes.replace("%{}%".format(p), str(self.task_parameters[p]))
            args["select"] = jmes
        return args
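# A minimal sketch of the %paramname% substitution used for select expressions above
# (expression and parameter values are hypothetical):
#
#   jmes = "Snapshots[?State=='%state%'].{Id:SnapshotId}"
#   task_parameters = {"state": "completed"}
#   for p in task_parameters:
#       jmes = jmes.replace("%{}%".format(p), str(task_parameters[p]))
#   # jmes is now "Snapshots[?State=='completed'].{Id:SnapshotId}"
#
# str.format cannot be used here because JMESPath projections themselves contain "{}" characters.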
class ConfigurationResourceHandler(CustomResource):
    def __init__(self, event, context):
        CustomResource.__init__(self, event, context)

        self.arguments = copy(self.resource_properties)
        self.arguments = {a: self.resource_properties[a] for a in self.resource_properties
                          if a not in ["ServiceToken", "Timeout"]}

        # setup logging
        dt = datetime.utcnow()
        classname = self.__class__.__name__
        logstream = LOG_STREAM.format(classname, dt.year, dt.month, dt.day)
        self._logger = QueuedLogger(logstream=logstream, context=context, buffersize=20)

    @classmethod
    def is_handling_request(cls, event, _):
        return event.get("StackId") is not None and event.get("ResourceType") == "Custom::TaskConfig"

    def handle_request(self):
        start = datetime.now()

        self._logger.info("Cloudformation request is {}", safe_json(self._event, indent=2))

        try:
            result = CustomResource.handle_request(self)
            return safe_dict({
                "datetime": datetime.now().isoformat(),
                "running-time": (datetime.now() - start).total_seconds(),
                "result": result
            })
        finally:
            self._logger.flush()

    def _create_request(self):
        name = self.resource_properties[CONFIG_TASK_NAME]
        try:
            self._logger.info("Creating new Task resource with name {}", name)
            self.physical_resource_id = name
            self.task = create_task(**self.arguments)
            self._logger.info("Created new resource with physical resource id {}", self.physical_resource_id)
            return True
        except Exception as ex:
            self.response["Reason"] = str(ex)
            self._logger.error(ERR_CREATING_TASK_, name, ex)
            return False

    def _update_request(self):
        self._logger.info("Updating Task resource")
        name = self.resource_properties.get(CONFIG_TASK_NAME)
        try:
            if name is None:
                raise_exception(ERR_NO_TASK_NAME_RESOURCE_PROPERTY)

            if name != self.physical_resource_id:
                self._logger.info("Name change for resource with physical resource id {}, new value is {}",
                                  self.physical_resource_id, name)
                self.arguments[CONFIG_TASK_NAME] = name
                create_task(**self.arguments)
                self.physical_resource_id = name
                self._logger.info("Created new resource with physical resource id {}", self.physical_resource_id)
            else:
                update_task(name, **self.arguments)
                self._logger.info("Updated resource with physical resource id {}", self.physical_resource_id)
            return True
        except Exception as ex:
            self.response["Reason"] = str(ex)
            self._logger.error(ERR_UPDATING_TASK, name, ex)
            return False

    def _delete_request(self):
        self._logger.info("Deleting Task resource")
        name = self.resource_properties.get(CONFIG_TASK_NAME)
        try:
            self._logger.info("Task name is {}, physical resource id is {}", name, self.physical_resource_id)
            # As the task can be part of a different stack than the scheduler that owns the configuration table,
            # the table could be deleted by that stack, so first check if the table still exists.
            if TaskConfiguration.config_table_exists():
                delete_task(self.physical_resource_id)
                self._logger.info("Deleted resource {} with physical resource id {}", name, self.physical_resource_id)
            else:
                self._logger.info("Configuration table no longer exists, so deletion of the item is skipped")
            return True
        except Exception as ex:
            self.response["Reason"] = str(ex)
            self._logger.error(ERR_DELETING_TASK, name, ex)
            return False
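# For illustration, a sketch of the CloudFormation custom resource handled by ConfigurationResourceHandler.
# Property names other than ServiceToken and Timeout are passed through to create_task; the task
# properties shown here are hypothetical:
#
#   "MyTaskConfig": {
#       "Type": "Custom::TaskConfig",
#       "Properties": {
#           "ServiceToken": "arn:aws:lambda:...",
#           "Name": "daily-snapshot",
#           "Enabled": true
#       }
#   }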
class CompletionHandler(object):
    """
    Class that handles time based events from CloudWatch rules
    """

    def __init__(self, event, context):
        """
        Initializes the instance.
        :param event: event to handle
        :param context: Lambda context
        """
        self._context = context
        self._event = event
        self._table = None

        # setup logging
        classname = self.__class__.__name__
        dt = datetime.utcnow()
        logstream = LOG_STREAM.format(classname, dt.year, dt.month, dt.day)
        self._logger = QueuedLogger(logstream=logstream, buffersize=20, context=context)

    @classmethod
    def is_handling_request(cls, event, _):
        """
        Tests if event is handled by instance of this handler.
        :param _:
        :param event: Tested event
        :return: True if the event is a cloudwatch rule event for task completion
        """
        source = event.get(handlers.HANDLER_EVENT_SOURCE, "")

        if source != "aws.events":
            return False

        resources = event.get("resources", [])
        if len(resources) == 1 and \
                resources[0].partition("/")[2].lower() == os.getenv(handlers.ENV_COMPLETION_RULE).lower():
            return True
        return False

    def handle_request(self):
        """
        Handles the cloudwatch rule timer event
        :return: Information on the tasks checked for completion, if any
        """
        try:
            start = datetime.now()

            count = 0
            tracking_table = TaskTrackingTable(context=self._context, logger=self._logger)

            for task in tracking_table.get_tasks_to_check_for_completion():
                count += 1

                task_id = task[handlers.TASK_TR_ID]
                last_check_for_completion_time = datetime.now().isoformat()
                tracking_table.update_task(task_id,
                                           task=task.get(handlers.TASK_TR_NAME, None),
                                           task_metrics=task.get(handlers.TASK_TR_METRICS, False),
                                           status_data={
                                               handlers.TASK_TR_LAST_WAIT_COMPLETION: last_check_for_completion_time
                                           })

                self._logger.debug("Task is {}", task)
                self._logger.info(INF_SET_COMPLETION_TASK_TIMER,
                                  task.get(handlers.TASK_TR_NAME, None),
                                  task_id,
                                  last_check_for_completion_time)

            running_time = float((datetime.now() - start).total_seconds())
            self._logger.info(INF_COMPLETION_ITEMS_SET, running_time, count)

            # if there are no tasks left to check for completion, the completion rule is disabled
            if count == 0 and not handlers.running_local(self._context):
                rule = handlers.disable_completion_cloudwatch_rule(self._context)
                self._logger.info(INF_DISABLED_COMPLETION_TIMER, rule)

            return safe_dict({
                "datetime": datetime.now().isoformat(),
                "running-time": running_time,
                "tasks-to-check": count
            })
        except ValueError as ex:
            self._logger.error(ERR_COMPLETION_HANDLER, ex, safe_json(self._event, indent=2))
        finally:
            self._logger.flush()
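# A sketch of the CloudWatch scheduled event shape matched by CompletionHandler.is_handling_request,
# assuming handlers.HANDLER_EVENT_SOURCE is the "source" key. The rule ARN is hypothetical; the rule
# name after the "/" must match the rule name held in the handlers.ENV_COMPLETION_RULE environment variable:
#
#   {
#       "source": "aws.events",
#       "resources": ["arn:aws:events:us-east-1:111122223333:rule/ops-automator-CompletionRule"]
#   }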