def handle_request(self):
    """
    Handles the select resources request. Creates new actions for resources found for a task
    :return: Results of handling the request
    """

    def filter_by_action_filter(srv, used_role, r):
        filter_method = getattr(self.action_class, actions.SELECT_AND_PROCESS_RESOURCE_METHOD, None)
        if filter_method is not None:
            r = filter_method(srv, self._logger, self._resource_name, r, self._context, self.task, used_role)
            if r is None:
                self._logger.debug(DEBUG_FILTER_METHOD, self.action_class.__name__,
                                   actions.SELECT_AND_PROCESS_RESOURCE_METHOD)
                return None
            else:
                self._logger.debug(DEBUG_FILTERED_RESOURCE, self.action_class.__name__,
                                   actions.SELECT_AND_PROCESS_RESOURCE_METHOD, safe_json(r, indent=3))

        return r

    def is_selected_resource(aws_service, resource, used_role, taskname, tags_filter, does_resource_supports_tags):

        # No tags, then just use filter method if any
        if not does_resource_supports_tags:
            self._logger.debug(DEBUG_RESOURCE_NO_TAGS, resource)
            return filter_by_action_filter(srv=aws_service, used_role=used_role, r=resource)

        tags = resource.get("Tags", {})

        # name of the tag that holds the list of tasks for this resource
        tagname = self._task_tag

        if tags_filter is None:
            # test if name of the task is in list of tasks in tag value
            if (tagname not in tags) or (taskname not in tagging.split_task_list(tags[tagname])):
                self._logger.debug(DEBUG_RESOURCE_NOT_SELECTED, safe_json(resource, indent=2), taskname,
                                   ','.join(["'{}'='{}'".format(t, tags[t]) for t in tags]))
                return None
            self._logger.debug(DEBUG_SELECTED_BY_TASK_NAME_IN_TAG_VALUE, safe_json(resource, indent=2),
                               tagname, taskname)
        else:
            # using a tag filter, * means any tag
            if tags_filter != tagging.tag_filter_set.WILDCARD_CHAR:

                # test if there are any tags matching the tag filter
                if not TagFilterExpression(tags_filter).is_match(tags):
                    self._logger.debug(DEBUG_RESOURCE_NOT_SELECTED_TAG_FILTER, safe_json(resource, indent=2),
                                       taskname,
                                       ','.join(["'{}'='{}'".format(t, tags[t]) for t in tags]))
                    return None
                self._logger.debug(DEBUG_SELECTED_BY_TAG_FILTER, safe_json(resource, indent=2), tags,
                                   tags_filter, taskname)
            else:
                self._logger.debug(DEBUG_SELECTED_WILDCARD_TAG_FILTER, safe_json(resource, indent=2), taskname)
                return filter_by_action_filter(srv=aws_service, used_role=used_role, r=resource)

        return filter_by_action_filter(srv=aws_service, used_role=used_role, r=resource)

    def resource_batches(resources):
        """
        Returns resources as chunks of size items. If the class has an optional custom aggregation function then
        the resources are aggregated first using this function before applying the batch size
        :param resources: resources to process
        :return: Generator for blocks of resource items
        """
        aggregate_func = getattr(self.action_class, actions.CUSTOM_AGGREGATE_METHOD, None)

        for i in aggregate_func(resources, self.task_parameters, self._logger) \
                if aggregate_func is not None else [resources]:
            if self.batch_size is None:
                yield i
            else:
                first = 0
                while first < len(i):
                    yield i[first:first + self.batch_size]
                    first += self.batch_size

    def setup_tag_filtering(t_name):
        # get optional tag filter
        no_select_by_tags = self.action_properties.get(actions.ACTION_NO_TAG_SELECT, False)
        if no_select_by_tags:
            tag_filter_string = tagging.tag_filter_set.WILDCARD_CHAR
        else:
            tag_filter_string = self.task.get(handlers.TASK_TAG_FILTER)

        # set if only a single tag is required for selecting the resources, it is used to optimize the select
        select_tag = None
        if tag_filter_string is None:
            self._logger.debug(DEBUG_SELECT_BY_TASK_NAME, self._resource_name, self._task_tag, t_name)
            select_tag = self._task_tag
        elif tag_filter_string == tagging.tag_filter_set.WILDCARD_CHAR:
            self._logger.debug(DEBUG_SELECT_ALL_RESOURCES, self._resource_name)
        else:
            self._logger.debug(DEBUG_TAG_FILTER_USED_TO_SELECT_RESOURCES, self._resource_name)
            # build the tag expression that is used to filter the resources
            tag_filter_expression = TagFilterExpression(tag_filter_string)
            # the keys of the used tags
            tag_filter_expression_tag_keys = list(tag_filter_expression.get_filter_keys())
            # if there is only a single tag then we can optimize by just filtering on that specific tag
            if len(tag_filter_expression_tag_keys) == 1 and \
                    tagging.tag_filter_set.WILDCARD_CHAR not in tag_filter_expression_tag_keys[0]:
                select_tag = tag_filter_expression_tag_keys[0]
        return select_tag, tag_filter_string

    def add_aggregated(aggregated_resources):
        # create task actions for aggregated resources, optionally split in batch size chunks
        for ra in resource_batches(aggregated_resources):
            if self._check_can_execute(ra):
                action_item = self.actions_tracking.add_task_action(task=self.task,
                                                                    assumed_role=assumed_role,
                                                                    action_resources=ra,
                                                                    task_datetime=self.task_dt,
                                                                    source=self.source,
                                                                    task_group=self.task_group)

                self._logger.debug(DEBUG_ADDED_AGGREGATED_RESOURCES_TASK, action_item[handlers.TASK_TR_ID],
                                   len(ra), self._resource_name, self.task[handlers.TASK_NAME])

                self._logger.debug("Added item\n{}", safe_json(action_item, indent=3))

                yield action_item

    def add_as_individual(resources):
        for ri in resources:
            # task action for each selected resource
            if self._check_can_execute([ri]):
                action_item = self.actions_tracking.add_task_action(task=self.task,
                                                                    assumed_role=assumed_role,
                                                                    action_resources=ri,
                                                                    task_datetime=self.task_dt,
                                                                    source=self.source,
                                                                    task_group=self.task_group)

                self._logger.debug(DEBUG_ADD_SINGLE_RESOURCE_TASK, action_item[handlers.TASK_TR_ID],
                                   self._resource_name, self.task[handlers.TASK_NAME])
                self._logger.debug("Added item\n{}", safe_json(action_item, indent=3))
                yield action_item

    try:
        task_items = []
        start = datetime.now()

        self._logger.debug(DEBUG_EVENT, safe_json(self._event, indent=3))
        self._logger.debug(DEBUG_ACTION, safe_json(self.action_properties, indent=3))

        self._logger.info(INFO_SELECTED_RESOURCES, self._resource_name, self.service,
                          self.task[handlers.TASK_NAME])
        self._logger.info(INFO_AGGR_LEVEL, self.aggregation_level)

        task_level_aggregated_resources = []

        args = self._build_describe_argument()

        service_resource_with_tags = services.create_service(self.service).resources_with_tags
        if self._resource_name == "":
            supports_tags = len(service_resource_with_tags) != 0
        else:
            supports_tags = self._resource_name.lower() in [r.lower() for r in service_resource_with_tags]

        args["tags"] = supports_tags
        self._logger.info(INFO_USE_TAGS_TO_SELECT, "R" if supports_tags else "No r")

        task_name = self.task[handlers.TASK_NAME]
        count_resource_items = 0
        selected_resource_items = 0

        select_on_tag, tag_filter_str = setup_tag_filtering(task_name)

        filter_func = getattr(self.action_class, actions.FILTER_RESOURCE_METHOD, None)

        # timer to guard selection time and log warning if getting close to lambda timeout
        if self._context is not None:
            self.start_timer(REMAINING_TIME_AFTER_DESCRIBE)
        try:
            for assumed_role in self._task_assumed_roles():
                retry_strategy = get_default_retry_strategy(service=self.service, context=self._context)

                service = services.create_service(service_name=self.service,
                                                  service_retry_strategy=retry_strategy,
                                                  role_arn=assumed_role)

                if self.is_timed_out():
                    break

                # contains resources for account
                account_level_aggregated_resources = []

                self._logger.info(INFO_ACCOUNT, service.aws_account)
                if assumed_role not in [None, ""]:
                    self._logger.info(INFO_ASSUMED_ROLE, assumed_role)

                for region in self._regions:

                    # test for timeouts
                    if self.is_timed_out():
                        break

                    # handle region passed in the event
                    if region is not None:
                        args["region"] = region
                    else:
                        if "region" in args:
                            del args["region"]

                    # resources can be passed in the event by event handlers
                    all_resources = self._event.get(handlers.HANDLER_SELECT_RESOURCES, None)

                    if all_resources is None:

                        # actions can have an optional method to select resources
                        action_custom_describe_function = getattr(self.action_class, "describe_resources", None)
                        if action_custom_describe_function is not None and self.use_custom_select:
                            all_resources = action_custom_describe_function(service, self.task, region)
                        else:
                            # select resources from the service
                            self._logger.debug(DEBUG_SELECT_PARAMETERS, self._resource_name, self.service, args)
                            # selecting a list of all resources in this account/region
                            all_resources = list(service.describe(self._resource_name,
                                                                  filter_func=filter_func,
                                                                  select_on_tag=select_on_tag,
                                                                  **args))
                    # test for timeout
                    if self.is_timed_out():
                        break

                    count_resource_items += len(all_resources)

                    self._logger.info(INFO_RESOURCES_FOUND, len(all_resources))

                    # select resources that are processed by the task
                    selected_resources = []
                    for sr in all_resources:
                        sel = is_selected_resource(aws_service=service,
                                                   resource=sr,
                                                   used_role=assumed_role,
                                                   taskname=task_name,
                                                   tags_filter=tag_filter_str,
                                                   does_resource_supports_tags=supports_tags)
                        if sel is not None:
                            selected_resources.append(sel)

                    selected_resource_items += len(selected_resources)

                    # display found and selected resources
                    if len(all_resources) > 0:
                        self._logger.info(INFO_RESOURCES_SELECTED, len(selected_resources))
                        if len(selected_resources) == 0:
                            continue

                    # delete tags if not needed by the action
                    if not self.keep_tags:
                        for res in selected_resources:
                            if "Tags" in res:
                                del res["Tags"]

                    # add resources to total list of resources for this task
                    if self.aggregation_level == actions.ACTION_AGGREGATION_TASK:
                        task_level_aggregated_resources += selected_resources

                    # add resources to list of resources for this account
                    if self.aggregation_level == actions.ACTION_AGGREGATION_ACCOUNT:
                        account_level_aggregated_resources += selected_resources

                    # add batch(es) of resources for this region
                    if self.aggregation_level == actions.ACTION_AGGREGATION_REGION and \
                            len(selected_resources) > 0:
                        task_items += list(add_aggregated(selected_resources))

                    # no aggregation, add each individual resource
                    if self.aggregation_level == actions.ACTION_AGGREGATION_RESOURCE and \
                            len(selected_resources) > 0:
                        task_items += list(add_as_individual(selected_resources))

                # at the end of the region loop, check if aggregated resources for account need to be added
                if self.aggregation_level == actions.ACTION_AGGREGATION_ACCOUNT and \
                        len(account_level_aggregated_resources) > 0:
                    task_items += list(add_aggregated(account_level_aggregated_resources))

            # at the end of the accounts loop, check if aggregated resources for task need to be added
            if self.aggregation_level == actions.ACTION_AGGREGATION_TASK and \
                    len(task_level_aggregated_resources) > 0:
                task_items += list(add_aggregated(task_level_aggregated_resources))

        except Exception as ex:
            raise_exception(ERR_SELECTING_TASK_RESOURCES, self.task[handlers.TASK_NAME], ex)

        finally:
            if self._timer is not None:
                # cancel the timer used to guard against timeouts when selecting resources
                self._timer.cancel()
                if self.is_timed_out():
                    raise_exception(ERR_TIMEOUT_SELECTING_RESOURCES, self._resource_name, self.service, task_name)

                self.start_timer(REMAINING_TIME_AFTER_STORE)

                self.actions_tracking.flush(self._timeout_event)
                if self.is_timed_out():
                    raise_exception(ERR_CREATING_TASKS_FOR_SELECTED_RESOURCES, task_name)
                self._timer.cancel()
            else:
                self.actions_tracking.flush()

        self._logger.info(INFO_ADDED_ITEMS, len(task_items), self.task[handlers.TASK_NAME])

        running_time = float((datetime.now() - start).total_seconds())
        self._logger.info(INFO_RESULT, running_time)

        if self.metrics:
            put_task_select_data(task_name=task_name,
                                 items=count_resource_items,
                                 selected_items=selected_resource_items,
                                 logger=self._logger,
                                 selection_time=running_time)

        return safe_dict({
            "datetime": datetime.now().isoformat(),
            "running-time": running_time,
            "dispatched-tasks": task_items
        })

    finally:
        self._logger.flush()
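
# Illustrative sketch (not part of the solution): the chunking performed by
# resource_batches above reduces to slicing a list into fixed-size blocks.
# The helper name and the batch size below are hypothetical.

def batches(items, batch_size=None):
    # without a configured batch size the whole list is one block
    if batch_size is None:
        yield items
        return
    # otherwise yield consecutive slices of at most batch_size items
    for first in range(0, len(items), batch_size):
        yield items[first:first + batch_size]

# list(batches(list(range(7)), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]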
def _start_task_execution(self, task_item, action=handlers.HANDLER_ACTION_EXECUTE):
    """
    Creates an instance of the lambda function that executes the task's action. It first checks if the action has
    specific memory requirements and based on this it creates a copy of this instance or one configured for the
    required memory. All information for executing the action is passed in the event.
    :param task_item: Task item for which the action is executed
    :return:
    """
    try:
        self._logger.debug("Entering start_task_execution ({}) with task {}", action,
                           safe_json(task_item, indent=3))

        # Create event for execution of the action and set its action so that it is picked up by the
        # execution handler
        event = {i: task_item.get(i) for i in task_item}
        event[handlers.HANDLER_EVENT_ACTION] = action

        self._logger.debug(DEBUG_ACTION, task_item[handlers.TASK_TR_ACTION],
                           task_item[handlers.TASK_TR_NAME],
                           task_item[handlers.TASK_TR_ID])

        self._logger.debug(DEBUG_ACTION_PARAMETERS,
                           safe_json(task_item.get(handlers.TASK_TR_PARAMETERS, {}), indent=3))

        # get memory allocation for executing the task
        lambda_size = handlers.TASK_TR_COMPLETION_SIZE \
            if action == handlers.HANDLER_ACTION_TEST_COMPLETION \
            else handlers.TASK_TR_EXECUTE_SIZE

        execute_lambda_size = task_item.get(lambda_size, actions.ACTION_SIZE_STANDARD)

        if execute_lambda_size == actions.ACTION_USE_ECS:
            ecs_memory = task_item.get(handlers.TASK_EXECUTE_ECS_MEMORY
                                       if action == handlers.HANDLER_ACTION_EXECUTE
                                       else handlers.TASK_COMPLETION_ECS_MEMORY, None)
        else:
            ecs_memory = None

        if not handlers.running_local(self._context):

            self._logger.debug(DEBUG_MEMORY_SIZE, execute_lambda_size)

            if execute_lambda_size != actions.ACTION_USE_ECS:

                # create event payload
                payload = str.encode(safe_json(event))

                # determine which lambda to execute on
                function_name = "{}-{}-{}".format(os.getenv(handlers.ENV_STACK_NAME),
                                                  os.getenv(handlers.ENV_LAMBDA_NAME),
                                                  execute_lambda_size)

                self._logger.debug("Running execution of task on lambda function {}", function_name)
                self._logger.debug(DEBUG_LAMBDA_FUNCTION_, function_name, payload)

                # start lambda function
                lambda_client = boto_retry.get_client_with_retries("lambda", ["invoke"],
                                                                   context=self._context,
                                                                   logger=self._logger)
                resp = lambda_client.invoke_with_retries(FunctionName=function_name,
                                                         InvocationType="Event",
                                                         LogType="None",
                                                         Payload=payload)

                task_info = {
                    "id": task_item[handlers.TASK_TR_ID],
                    "task": task_item[handlers.TASK_TR_NAME],
                    "action": task_item[handlers.TASK_TR_ACTION],
                    "payload": payload,
                    "status-code": resp["StatusCode"]
                }
                self._logger.debug(DEBUG_LAMBDA, safe_json(task_info, indent=2))
                self.invoked_lambda_functions.append(task_info)
            else:
                # run as ECS job
                ecs_args = {
                    handlers.HANDLER_EVENT_ACTION: action,
                    handlers.TASK_NAME: task_item[handlers.TASK_TR_NAME],
                    handlers.TASK_TR_ID: task_item[handlers.TASK_TR_ID]
                }

                self._logger.debug(DEBUG_RUNNING_ECS_TASK, action, task_item[handlers.TASK_TR_NAME])
                handlers.run_as_ecs_job(ecs_args,
                                        ecs_memory_size=ecs_memory,
                                        context=self._context,
                                        logger=self._logger)
        else:
            lambda_handler(event, self._context)

        ResultNotifications(context=self._context, logger=self._logger).publish_started(task_item)

    except Exception as ex:
        self._logger.error(ERR_RUNNING_TASK, task_item, str(ex), full_stack())
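
# Illustrative sketch (plain boto3, without the retry wrappers used above):
# how a memory-specific deployment of the same handler can be invoked
# asynchronously. The environment variable names here are hypothetical.
import json
import os

import boto3

def invoke_sized_lambda(event, size):
    # the solution composes "<stack>-<lambda>-<size>" to pick the deployment
    # that was created with the required memory allocation
    function_name = "{}-{}-{}".format(os.getenv("STACK_NAME"), os.getenv("LAMBDA_NAME"), size)
    return boto3.client("lambda").invoke(FunctionName=function_name,
                                         InvocationType="Event",  # asynchronous, fire and forget
                                         LogType="None",
                                         Payload=json.dumps(event).encode())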
def is_completed(self, snapshot_create_data):

    def grant_create_volume_permissions(snap_ids):

        if self.accounts_with_create_permissions is not None and len(self.accounts_with_create_permissions) > 0:

            args = {
                "CreateVolumePermission": {
                    "Add": [{"UserId": a.strip()} for a in self.accounts_with_create_permissions]
                }
            }
            for snapshot_id in snap_ids:
                args["SnapshotId"] = snapshot_id

                try:
                    self.ec2_client.modify_snapshot_attribute_with_retries(**args)
                    self._logger_.info(INFO_SETTING_CREATE_VOLUME_PERMISSIONS,
                                       ", ".join(self.accounts_with_create_permissions))
                    self.result["create-volume-access-accounts"] = \
                        [a.strip() for a in self.accounts_with_create_permissions]
                except Exception as ex:
                    raise_exception(ERR_SETTING_CREATE_VOLUME_PERMISSIONS,
                                    self.accounts_with_create_permissions, ex)

    def tag_shared_snapshots(snapshot_data, snap_ids):
        if self.accounts_with_create_permissions not in ["", None] and self.tag_shared_snapshots:
            for account in self.accounts_with_create_permissions:

                session_for_tagging = self.get_action_session(account=account,
                                                              param_name=PARAM_SHARED_ACCOUNT_TAGGING_ROLENAME,
                                                              logger=self._logger_)

                if session_for_tagging is None:
                    self._logger_.error(ERR_TAGS_NOT_SET_IN_ACCOUNT, account)
                    continue

                try:
                    ec2_client = get_client_with_retries(service_name="ec2",
                                                         methods=["create_tags", "delete_tags"],
                                                         context=self._context_,
                                                         region=self._region_,
                                                         session=session_for_tagging,
                                                         logger=self._logger_)

                    for snap_id in snap_ids:
                        tags = snapshot_data.get(snap_id, {}).get("tags", None)
                        if tags is not None:
                            self._logger_.info(INFO_SET_SNAPSHOT_TAGS_SHARED, safe_json(tags, indent=3),
                                               snap_id, account, self._region_)
                            tagging.set_ec2_tags(ec2_client=ec2_client,
                                                 resource_ids=[snap_id],
                                                 tags=tags,
                                                 logger=self._logger_)
                except Exception as ex:
                    raise Exception(ERR_SETTING_SHARED_TAGS.format(account, str(ex)))

    def set_volume_tags(volume_id, snap_id):
        tags = self.build_tags_from_template(parameter_name=PARAM_VOLUME_TAGS,
                                             tag_variables={
                                                 TAG_PLACEHOLDER_VOLUME_SNAPSHOT: snap_id
                                             })

        if len(tags) > 0:
            try:
                tagging.set_ec2_tags(ec2_client=self.ec2_client,
                                     resource_ids=[volume_id],
                                     tags=tags,
                                     logger=self._logger_)
                self._logger_.info(INFO_SET_VOLUME_TAGS, safe_json(tags, indent=3), volume_id)
            except Exception as ex:
                raise Exception(ERR_SETTING_VOLUME_TAGS.format(self.instance_id, ex))

    def set_instance_tags(snap_ids):
        tags = self.build_tags_from_template(parameter_name=PARAM_INSTANCE_TAGS,
                                             tag_variables={
                                                 TAG_PLACEHOLDER_INSTANCE_SNAPSHOTS: ','.join(sorted(snap_ids))
                                             })
        if len(tags) > 0:
            try:
                self.set_ec2_instance_tags_with_event_loop_check(instance_ids=[self.instance_id],
                                                                 tags_to_set=tags,
                                                                 client=self.ec2_client,
                                                                 region=self._region_)
                self._logger_.info(INFO_SET_INSTANCE_TAGS, safe_json(tags, indent=3), self.instance_id)
            except Exception as ex:
                raise Exception(ERR_SETTING_INSTANCE_TAGS.format(self.instance_id, ex))

    snapshot_ids = [volume.get("create_snapshot", {}).get("SnapshotId")
                    for volume in list(snapshot_create_data.get("volumes", {}).values())]

    self._logger_.info(INFO_CHECKING_SNAPSHOT_STATUS, ",".join(snapshot_ids))

    if len(snapshot_ids) == 0:
        return {
            "InstanceId": snapshot_create_data["instance"],
            "Volumes": []
        }

    # create service instance to test if snapshots are available
    ec2 = services.create_service("ec2",
                                  session=self._session_,
                                  service_retry_strategy=get_default_retry_strategy("ec2",
                                                                                    context=self._context_))

    # test if the snapshots with the ids that were returned from the CreateSnapshot API call exist
    # and are completed
    snapshots = list(ec2.describe(services.ec2_service.SNAPSHOTS,
                                  OwnerIds=["self"],
                                  region=self.instance["Region"],
                                  Filters=[
                                      {
                                          "Name": "snapshot-id",
                                          "Values": snapshot_ids
                                      }
                                  ]))

    if len(snapshots) != len(snapshot_ids):
        # allow 5 minutes for all snapshots to appear
        start_time = dateutil.parser.parse(snapshot_create_data["start-time"])
        if self._datetime_.now() - start_time < timedelta(minutes=5):
            self._logger_.info(INFO_NOT_ALL_IN_PROGRESS)
            return None

    test_result = {
        "InstanceId": snapshot_create_data["instance"],
        "Volumes": [{
            "VolumeId": s["VolumeId"],
            "SnapshotId": s["SnapshotId"],
            "State": s["State"],
            "Progress": s["Progress"]
        } for s in snapshots]
    }

    self._logger_.info(INFO_STATE_SNAPSHOTS, safe_json(test_result, indent=3))

    # wait until all snapshots are no longer pending
    for volume in test_result["Volumes"]:
        if volume["State"] == SNAPSHOT_STATE_PENDING:
            self._logger_.info(INFO_CREATION_PENDING)
            return None

    # collect possible failed snapshots
    failed = []
    for volume in test_result["Volumes"]:
        if volume["State"] == SNAPSHOT_STATE_ERROR:
            failed.append(volume)

    if len(failed) > 0:
        s = ",".join([ERR_FAILED_SNAPSHOT.format(volume["SnapshotId"], volume["VolumeId"])
                      for volume in failed])
        raise Exception(s)

    if len(snapshot_ids) != len(snapshots):
        created_snapshots = [s["SnapshotId"] for s in snapshots]
        raise Exception(ERR_MISSING_SNAPSHOTS.format(
            ",".join([s for s in snapshot_ids if s not in created_snapshots])))

    snapshot_ids = [s["SnapshotId"] for s in snapshots]

    # set tags on source instance
    set_instance_tags(snapshot_ids)

    for s in snapshots:
        set_volume_tags(volume_id=s["VolumeId"], snap_id=s["SnapshotId"])

    # set permissions to create volumes from snapshots
    grant_create_volume_permissions(snapshot_ids)

    # tag resources in accounts the snapshots are shared with
    tag_shared_snapshots(snapshot_create_data.get("snapshots", {}), snapshot_ids)

    self._logger_.info(INFO_COMPLETED)
    return test_result
def run_as_ecs_job(args, ecs_memory_size, context=None, logger=None):
    """
    Runs a task step as an ecs task
    :param args: ecs task parameters
    :param ecs_memory_size: reserved memory size for ecs task container
    :param context: lambda context
    :param logger: logger
    :return: result of ecs task submission
    """
    start_time = time.time()
    start_task_timeout = 300

    def timed_out_no_context(next_wait):
        return (time.time() - start_time) > (start_task_timeout - next_wait)

    def timed_out_by_lambda_timeout(next_wait):
        if context is None:
            return False
        context_seconds_left = context.get_remaining_time_in_millis() / 1000
        return context_seconds_left < (5 + next_wait)

    runner_args = copy.deepcopy(args)

    ecs_client = boto_retry.get_client_with_retries("ecs", ["run_task"], context=context)
    stack_name = os.getenv(ENV_STACK_NAME)

    runner_args["stack"] = stack_name
    runner_args["stack_region"] = ecs_client.meta.region_name

    ecs_params = {
        "cluster": os.getenv(ENV_ECS_CLUSTER),
        "taskDefinition": os.getenv(ENV_ECS_TASK),
        "startedBy": "{}:{}".format(stack_name, args[TASK_NAME])[0:35],
        "overrides": {
            "containerOverrides": [
                {
                    "name": "ops-automator",
                    "command": ["python", "ops-automator-ecs-runner.py", safe_json(runner_args)],
                    "memoryReservation": int(ecs_memory_size if ecs_memory_size is not None
                                             else ECS_DEFAULT_MEMORY_RESERVATION)
                }
            ],
        },
    }

    for wait_until_next_retry in boto_retry.LinearWaitStrategy(start=5, incr=5, max_wait=30, random_factor=0.50):

        # try to start the task
        resp = ecs_client.run_task_with_retries(**ecs_params)

        # test if the task was started
        if len(resp["tasks"]) != 0:
            if logger is not None:
                logger.info("{} executed as ECS job:{}\n", args[HANDLER_EVENT_ACTION],
                            safe_json(resp.get("tasks", []), indent=3))
            return resp

        # investigate failures, note that no exceptions are raised if a task fails because of
        # insufficient resources in the cluster
        failures = resp.get("failures", [])

        # test for failures other than not enough memory resources on the cluster instances,
        # and test if there is time left for another retry, based on the Lambda timeout or on a
        # fixed timeout when not running in Lambda
        if not all([f["reason"] == "RESOURCE:MEMORY" for f in failures]) or \
                (timed_out_by_lambda_timeout(next_wait=wait_until_next_retry) or
                 timed_out_no_context(next_wait=wait_until_next_retry)):
            raise_exception(ERR_FAILED_TO_START_ECS_TASK, safe_json(args), safe_json(failures, indent=3))
        else:
            # sleep until next retry
            time.sleep(wait_until_next_retry)
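
# Illustrative sketch of the LinearWaitStrategy semantics assumed by the retry
# loop above: an endless iterator of wait times that grow linearly up to a cap,
# with a random jitter factor applied to each value.
import random

def linear_wait_strategy(start=5, incr=5, max_wait=30, random_factor=0.5):
    wait = start
    while True:
        # apply up to +/- random_factor jitter around the base wait time
        yield wait * (1.0 + random_factor * (2.0 * random.random() - 1.0))
        wait = min(wait + incr, max_wait)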
def _handle_test_task_completion(self):

    def completion_timed_out():
        """
        Function is called when the handling of the request times out
        :return:
        """
        time_used = int(int(os.getenv(handlers.ENV_LAMBDA_TIMEOUT)) -
                        self._context.get_remaining_time_in_millis() / 1000)
        self._logger.warning(WARN_COMPLETION_CHECK_TIMEOUT, time_used)

        if self.action_properties.get(actions.ACTION_COMPLETION_SIZE, None) is not None:
            self._logger.warning(WARN_ADJUST_LAMBDA_MEMORY_SETTINGS_COMPLETION, time_used, self.task)

        self._timeout_event.set()
        self._logger.flush()
        if self._timer is not None:
            self._timer.cancel()

    execution_time = int(time.time()) - self.started_at
    execution_time_str = str(timedelta(seconds=execution_time))

    result_data = {
        "task": self.task,
        "action": self.action,
        "id": self.action_id,
        "datetime": datetime.now().isoformat(),
        "running-time": execution_time
    }

    if self._context is not None:
        execution_time_left = (self._context.get_remaining_time_in_millis() / 1000.00) - \
                              REMAINING_COMPLETION_CHECK
        self._timer = threading.Timer(execution_time_left, completion_timed_out)
        self._timer.start()

    try:
        # make one more check for completion before testing for timeout
        check_result = self._action_instance.is_completed(self.start_result)
    finally:
        if self._timer is not None:
            self._timer.cancel()
            if self._timeout_event.is_set():
                raise Exception("Task completion check timed out")

    if check_result is not None:
        self._action_tracking.update_task(action_id=self.action_id,
                                          task=self.task,
                                          task_metrics=self.metrics,
                                          status=handlers.STATUS_COMPLETED,
                                          status_data={
                                              handlers.TASK_TR_RESULT: check_result,
                                              handlers.TASK_TR_EXECUTION_TIME: str(execution_time)
                                          })

        self._logger.info(INF_TASK_COMPLETED, safe_json(check_result, indent=3), execution_time_str)

        result_data.update({
            "result": handlers.STATUS_COMPLETED,
            "action-result": str(check_result)
        })

    elif execution_time > self.timeout:
        self._action_tracking.update_task(action_id=self.action_id,
                                          task=self.task,
                                          task_metrics=self.metrics,
                                          status=handlers.STATUS_TIMED_OUT,
                                          status_data={
                                              handlers.TASK_TR_EXECUTION_TIME: str(execution_time)
                                          })

        self._logger.error(ERR_TASK_TIMEOUT, execution_time_str)

        result_data.update({
            "result": handlers.STATUS_TIMED_OUT
        })
        return result_data

    else:
        self._logger.info(INF_ACTION_NOT_COMPLETED, execution_time_str)
        result_data.update({
            "result": handlers.STATUS_WAIT_FOR_COMPLETION
        })

    return safe_dict(result_data)
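
# Illustrative sketch of the timeout-guard pattern used above: a threading.Timer
# fires shortly before the Lambda deadline and sets an event, which is checked
# after the potentially long-running call returns. Names are hypothetical.
import threading

def guarded_call(fn, seconds_left, margin=15):
    timed_out = threading.Event()
    timer = threading.Timer(seconds_left - margin, timed_out.set)
    timer.start()
    try:
        result = fn()
    finally:
        timer.cancel()
    if timed_out.is_set():
        raise Exception("call timed out")
    return result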
def add_task_action(self, task, assumed_role, action_resources, task_datetime, source, task_group=None):

    item = {
        handlers.TASK_TR_ID: str(uuid.uuid4()),
        handlers.TASK_TR_NAME: task[handlers.TASK_NAME],
        handlers.TASK_TR_ACTION: task[handlers.TASK_ACTION],
        handlers.TASK_TR_CREATED: datetime.now().isoformat(),
        handlers.TASK_TR_CREATED_TS: int(time()),
        handlers.TASK_TR_SOURCE: source,
        handlers.TASK_TR_DT: task_datetime,
        handlers.TASK_TR_STATUS: handlers.STATUS_PENDING,
        handlers.TASK_TR_DEBUG: task[handlers.TASK_DEBUG],
        handlers.TASK_TR_NOTIFICATIONS: task[handlers.TASK_NOTIFICATIONS],
        handlers.TASK_TR_METRICS: task[handlers.TASK_METRICS],
        handlers.TASK_TR_DRYRUN: task[handlers.TASK_DRYRUN],
        handlers.TASK_TR_INTERNAL: task[handlers.TASK_INTERNAL],
        handlers.TASK_TR_INTERVAL: task[handlers.TASK_INTERVAL],
        handlers.TASK_TR_TIMEZONE: task[handlers.TASK_TIMEZONE],
        handlers.TASK_TR_TIMEOUT: task[handlers.TASK_TIMEOUT],
        handlers.TASK_TR_STARTED_TS: int(time()),
        handlers.TASK_TR_EXECUTE_SIZE: task[handlers.TASK_EXECUTE_SIZE],
        handlers.TASK_TR_SELECT_SIZE: task[handlers.TASK_SELECT_SIZE],
        handlers.TASK_TR_EVENTS: task.get(handlers.TASK_EVENTS, {}),
        handlers.TASK_TR_COMPLETION_SIZE: task[handlers.TASK_COMPLETION_SIZE],
        handlers.TASK_TR_TAGFILTER: task[handlers.TASK_TAG_FILTER],
        handlers.TASK_TR_GROUP: task_group,
        handlers.TASK_TR_SERVICE: task[handlers.TASK_SERVICE],
        handlers.TASK_TR_RESOURCE_TYPE: task[handlers.TASK_RESOURCE_TYPE]
    }

    item[handlers.TASK_TR_RUN_LOCAL] = self._run_local

    if assumed_role is not None:
        item[handlers.TASK_TR_ASSUMED_ROLE] = assumed_role
        item[handlers.TASK_TR_ACCOUNT] = services.account_from_role_arn(assumed_role)
    else:
        item[handlers.TASK_TR_ACCOUNT] = self.account

    if len(task[handlers.TASK_PARAMETERS]) > 0:
        item[handlers.TASK_TR_PARAMETERS] = task[handlers.TASK_PARAMETERS]

    parameters = item.get(handlers.TASK_TR_PARAMETERS, None)
    if parameters is not None:
        item[handlers.TASK_TR_PARAMETERS] = parameters

    # check if the class has a field or static method that returns true if the action class needs completion,
    # this way completion can be made dependent on parameter values
    has_completion = getattr(actions.get_action_class(task[handlers.TASK_ACTION]),
                             actions.ACTION_PARAM_HAS_COMPLETION, None)
    if has_completion is not None:
        # if it is a static method then call it, passing the task parameters
        if isinstance(has_completion, types.FunctionType):
            has_completion = has_completion(parameters)
    else:
        # if the class does not have this attribute then test if it implements the completion method
        has_completion = getattr(actions.get_action_class(task[handlers.TASK_ACTION]),
                                 handlers.COMPLETION_METHOD, None) is not None

    item[handlers.TASK_TR_HAS_COMPLETION] = has_completion

    resource_data_str = safe_json(action_resources)

    if len(resource_data_str) < int(os.getenv(handlers.ENV_RESOURCE_TO_S3_SIZE, 16)) * 1024:
        item[handlers.TASK_TR_RESOURCES] = as_dynamo_safe_types(action_resources)
    else:
        bucket = os.getenv(handlers.ENV_RESOURCE_BUCKET)
        key = "{}.json".format(item[handlers.TASK_TR_ID])

        try:
            self.s3_client.put_object_with_retries(Body=resource_data_str, Bucket=bucket, Key=key)
        except Exception as ex:
            raise_exception(ERR_WRITING_RESOURCES, bucket, key, item[handlers.TASK_TR_ID], ex)
        item[handlers.TASK_TR_S3_RESOURCES] = True

    self._new_action_items.append(item)

    return item
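
# Illustrative sketch of the size-based offload above (hypothetical item keys,
# plain boto3-style client instead of the retry-wrapped one): resource data
# small enough for a DynamoDB item is stored inline, larger data goes to S3
# and the item only keeps a marker.
import json

def store_resources(item, resources, s3_client, bucket, max_kb=16):
    data = json.dumps(resources)
    if len(data) < max_kb * 1024:
        item["Resources"] = resources
    else:
        key = "{}.json".format(item["Id"])
        s3_client.put_object(Body=data, Bucket=bucket, Key=key)
        item["S3Resources"] = True
    return item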
def tasks_items_to_execute():
    """
    Generator function that selects all record items from the event that need processing.
    :return:
    """

    def table_name(rec):
        source_arn = rec["eventSourceARN"]
        return source_arn.split("/")[1]

    def from_tracking_table(rec):
        return table_name(rec) == os.getenv(handlers.ENV_ACTION_TRACKING_TABLE)

    def from_concurrency_table(rec):
        return table_name(rec) == os.getenv(handlers.ENV_CONCURRENCY_TABLE)

    def get_old_image(task_record):
        return task_record["dynamodb"].get("OldImage", {})

    def get_new_image(task_record):
        return task_record["dynamodb"].get("NewImage", {})

    def get_new_status(task_record):
        return get_new_image(task_record).get(handlers.TASK_TR_STATUS, {}).get("S")

    def get_old_status(task_record):
        return get_old_image(task_record).get(handlers.TASK_TR_STATUS, {}).get("S")

    def is_task_tracking_table_update(task_record):
        if not from_tracking_table(task_record):
            return False
        return task_record["eventName"] in ["UPDATE", "MODIFY"]

    def is_task_done(task_record):
        if not is_task_tracking_table_update(task_record):
            return False

        new_status = get_new_status(task_record)
        old_status = get_old_status(task_record)
        if old_status == new_status:
            return False
        return new_status in handlers.task_tracking_table.NOT_LONGER_ACTIVE_STATUSES

    def is_task_with_concurrency(task_record):
        return get_new_image(task_record).get(handlers.TASK_TR_CONCURRENCY_KEY, {}).get("S") is not None

    def get_old_last_update(task_record):
        return get_old_image(task_record).get(handlers.TASK_TR_LAST_WAIT_COMPLETION, {}).get("S")

    def get_new_last_update(task_record):
        return get_new_image(task_record).get(handlers.TASK_TR_LAST_WAIT_COMPLETION, {}).get("S")

    def is_delete_task(task_record):
        return from_tracking_table(task_record) and task_record["eventName"] == "REMOVE"

    def is_new_task(task_record):
        if from_tracking_table(task_record) and task_record["eventName"] == "INSERT":
            return get_new_status(task_record) == handlers.STATUS_PENDING
        return False

    def is_completed_with_concurrency(task_record):
        return is_task_done(task_record) and is_task_with_concurrency(task_record)

    def is_completed_without_concurrency(task_record):
        return is_task_done(task_record) and not is_task_with_concurrency(task_record)

    def is_wait_for_completion(task_record):
        if not is_task_tracking_table_update(task_record):
            return False
        if get_old_status(task_record) != handlers.STATUS_WAIT_FOR_COMPLETION or \
                get_new_status(task_record) != handlers.STATUS_WAIT_FOR_COMPLETION:
            return False
        return get_old_last_update(task_record) != get_new_last_update(task_record)

    def is_concurrency_task_completed(concurrency_record):
        if not from_concurrency_table(concurrency_record):
            return False
        if concurrency_record["eventName"] == "REMOVE":
            return False
        return concurrency_record["dynamodb"].get("NewImage", {}).get("RunNext", {}).get("BOOL", False)

    def get_action_type(rec):

        if is_new_task(rec):
            return NEW_TASK

        if is_completed_without_concurrency(rec):
            return FINISHED_TASK

        if is_completed_with_concurrency(rec):
            return FINISHED_CONCURRENCY_TASK

        if is_wait_for_completion(rec):
            return CHECK_COMPLETION

        if is_delete_task(rec):
            return DELETE_ITEM

        if is_concurrency_task_completed(rec):
            return START_WAITING_ACTION

        return None

    for r in self._event.get("Records"):

        self._logger.debug("Record to process is {}", safe_json(r, indent=2))

        if r.get("eventSource") == "aws:dynamodb":

            image_used = "NewImage" if "NewImage" in r["dynamodb"] else "OldImage"

            if r["dynamodb"].get("NewImage", {}).get(handlers.TASK_TR_ACTION) is None and \
                    r["dynamodb"].get("OldImage", {}).get(handlers.TASK_TR_ACTION) is not None:
                continue

            self._logger.debug_enabled = r["dynamodb"][image_used].get(handlers.TASK_TR_DEBUG,
                                                                       {}).get("BOOL", False)

            update_to_handle = get_action_type(r)

            if update_to_handle is not None:
                yield update_to_handle, r
            else:
                self._logger.debug("No action for record")
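
# Illustrative sketch: shape of a DynamoDB stream record as consumed by the
# classification helpers above (attribute-value encoded, names shortened).
sample_record = {
    "eventSource": "aws:dynamodb",
    "eventName": "MODIFY",
    "eventSourceARN": "arn:aws:dynamodb:us-east-1:111122223333:table/TaskTracking/stream/label",
    "dynamodb": {
        "OldImage": {"Status": {"S": "pending"}},
        "NewImage": {"Status": {"S": "completed"}},
    },
}

def status_change(record):
    # the table name is the second element of the ARN resource path
    table = record["eventSourceARN"].split("/")[1]
    old = record["dynamodb"].get("OldImage", {}).get("Status", {}).get("S")
    new = record["dynamodb"].get("NewImage", {}).get("Status", {}).get("S")
    return table, old, new

# status_change(sample_record) -> ("TaskTracking", "pending", "completed")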
def _handle_task_execution(self):

    def execute_timed_out():
        """
        Function is called when the handling of the request times out
        :return:
        """
        time_used = int(int(os.getenv(handlers.ENV_LAMBDA_TIMEOUT)) -
                        self._context.get_remaining_time_in_millis() / 1000)
        self._logger.error(ERR_EXECUTION_NOT_COMPLETED, time_used)

        if self.action_properties.get(actions.ACTION_EXECUTE_SIZE, None) is not None:
            self._logger.error(ERR_TIMEOUT, self.task)
        self._timeout_event.set()

        self._logger.flush()
        self._timer.cancel()

    def handle_metrics(result):
        self._logger.info(INF_SENDING_METRICS_DATA, "enabled" if allow_send_metrics() else "disabled")
        if allow_send_metrics():
            try:
                result_data = result if isinstance(result, dict) else json.loads(result)
                if actions.METRICS_DATA in result_data:
                    send_metrics_data(metrics_data=result_data[actions.METRICS_DATA], logger=self._logger)
            except Exception as ex:
                self._logger.warning(WARN_METRICS_DATA, str(ex))

    self._logger.info(INF_ACTION, self.action, self.action_id, self.task,
                      safe_json(self.action_parameters, indent=3))
    if not handlers.running_local(self._context):
        self._logger.info(INF_LAMBDA_MEMORY, self._context.function_name, self._context.memory_limit_in_mb)

    self._logger.debug("Setting task state to {}", handlers.STATUS_STARTED)
    self._action_tracking.update_task(self.action_id, self.task,
                                      task_metrics=self.metrics,
                                      status=handlers.STATUS_STARTED)

    start = time.time()

    return_data = {
        "task": self.task,
        "action": self.action,
        "id": self.action_id,
        "dryrun": self.dryrun,
    }

    if self._context is not None:
        execution_time_left = (self._context.get_remaining_time_in_millis() / 1000.00) - EXECUTE_TIME_REMAINING
        self._timer = threading.Timer(execution_time_left, execute_timed_out)
        self._timer.start()

    try:
        self._logger.debug("Start executing task")
        action_result = self._action_instance.execute()
        if isinstance(action_result, str):
            action_result = json.loads(action_result)
    finally:
        if self._timer is not None:
            self._timer.cancel()
            if self._timeout_event.is_set():
                raise Exception("Timeout execution action")

    if not self._action_instance.properties.get(actions.ACTION_INTERNAL, False):
        handle_metrics(action_result)

    execution_time = int(time.time() - start)
    self._logger.debug("Task needs{}completion", " no " if not self.has_completion else " ")

    if not self.has_completion or self.dryrun:

        self._logger.debug("Setting state of task to {} ", handlers.STATUS_COMPLETED)
        self._action_tracking.update_task(action_id=self.action_id,
                                          task=self.task,
                                          task_metrics=self.metrics,
                                          status=handlers.STATUS_COMPLETED,
                                          status_data={
                                              handlers.TASK_TR_STARTED_TS: int(start),
                                              handlers.TASK_TR_RESULT: action_result,
                                              handlers.TASK_TR_EXECUTION_TIME: str(execution_time),
                                              handlers.TASK_TR_EXECUTION_LOGSTREAM: self.execution_log_stream
                                          })

        # noinspection PyBroadException
        try:
            self._logger.info(INF_ACTION_RESULT, execution_time, safe_json(action_result, indent=3))
        except Exception:
            self._logger.info(INF_ACTION_RESULT, execution_time, str(action_result))

    else:
        # the action has a method for testing completion of the task, set the status to waiting and store
        # the result of the execution that started the action; this start result is passed to the
        # completion method later
        self._logger.debug("Setting state of task to {} ", handlers.STATUS_WAIT_FOR_COMPLETION)

        self._action_tracking.update_task(action_id=self.action_id,
                                          task=self.task,
                                          task_metrics=self.metrics,
                                          status=handlers.STATUS_WAIT_FOR_COMPLETION,
                                          status_data={
                                              handlers.TASK_TR_LAST_WAIT_COMPLETION:
                                                  datetime.now().isoformat(),
                                              handlers.TASK_TR_STARTED_TS: int(start),
                                              handlers.TASK_TR_START_RESULT: action_result,
                                              handlers.TASK_TR_START_EXECUTION_TIME: str(execution_time),
                                              handlers.TASK_TR_EXECUTION_LOGSTREAM: self.execution_log_stream
                                          })

        self._logger.info(INF_STARTED_AND_WAITING_FOR_COMPLETION, safe_json(action_result, indent=3))

        if not handlers.running_local(self._context):
            rule = handlers.enable_completion_cloudwatch_rule(self._context)
            self._logger.info(INF_RULE_ENABLED, rule)
        else:
            self._logger.info(INF_SIMULATION_MODE_NO_RULE_ENABLED)

    # no exception from action
    return_data.update({
        "result": handlers.STATUS_WAIT_FOR_COMPLETION if self.has_completion else handlers.STATUS_COMPLETED,
        "action-result": str(action_result),
        "datetime": datetime.now().isoformat(),
        "running-time": str(execution_time),
        "task-group": self._event[handlers.TASK_TR_GROUP],
        "task-id": self._event[handlers.TASK_TR_ID]
    })

    return safe_dict(return_data)
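
# Illustrative sketch (plain boto3; the rule name is hypothetical): enabling
# the CloudWatch Events rule that periodically triggers the completion checks
# the wait-for-completion branch above relies on.
import boto3

def enable_completion_rule(rule_name="ops-automator-completion-rule"):
    boto3.client("events").enable_rule(Name=rule_name)
    return rule_name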
def validate_new_instance():
    replaced_state = self.ec2.get_instance(instance_id=self.replaced_instance_id)["State"]["Code"] & 0xFF

    if self.new_instance_id is None:
        if expected_new_type != self.replaced_instance["InstanceType"] or replace_if_same_type:
            self.fail("Instance was expected to be replaced")
        if replaced_state in [replace_instance.EC2_STATE_TERMINATED,
                              replace_instance.EC2_STATE_SHUTTING_DOWN,
                              replace_instance.EC2_STATE_STOPPING]:
            self.fail("Instance {} was not replaced but is no longer running".format(self.replaced_instance_id))
        self.logger.test("[X] Instance was not replaced as expected")
        return True

    if replaced_state == replace_instance.EC2_STATE_RUNNING:
        self.fail("Replaced instance {} is still running".format(self.replaced_instance_id))

    replaced_instance_checked_data = jmespath.search(JMESPATH_CHECKED_INSTANCE_DATA, self.replaced_instance)
    new_instance_checked_data = jmespath.search(JMESPATH_CHECKED_INSTANCE_DATA, self.new_instance)
    if replaced_instance_checked_data != new_instance_checked_data:
        self.fail("Instance data is {}, expected {}".format(safe_json(new_instance_checked_data, indent=3),
                                                            safe_json(replaced_instance_checked_data, indent=3)))

    # instance id must be different if the type was not the same, or if instances are replaced even when
    # they are of the same type
    if self.replaced_instance_id == self.new_instance_id:
        if not same_type or replace_if_same_type:
            self.fail("InstanceId {} of new instance should not be the same as the id of the replaced "
                      "instance".format(self.replaced_instance_id))

    new_instance_state = self.new_instance["State"]["Code"] & 0xFF
    if stopped_instance:
        if not replace_instance.Ec2ReplaceInstanceAction.is_in_stopping_or_stopped_state(new_instance_state):
            self.fail("New instance {} is not in stopping or stopped state, {}".format(self.new_instance_id,
                                                                                       new_instance_state))
    else:
        if replace_instance.EC2_STATE_RUNNING != new_instance_state:
            self.fail("New instance {} is not running, {}".format(self.new_instance_id, new_instance_state))

    check_load_balancer_registrations()

    instance_placeholders = {
        replace_instance.TAG_PLACEHOLDER_ORG_INSTANCE_ID: self.replaced_instance_id,
        replace_instance.TAG_PLACEHOLDER_NEW_INSTANCE_TYPE: self.new_instance["InstanceType"],
        replace_instance.TAG_PLACEHOLDER_ORG_INSTANCE_TYPE: self.start_type
    }

    new_instance_tags = self.new_instance.get("Tags", {})
    if not testing.tags.verify_placeholder_tags(new_instance_tags, instance_placeholders,
                                                exclude_tags=["tag-" + tagging.TAG_VAL_TASK]) or \
            new_instance_tags.get("Name") != test_method or \
            new_instance_tags.get("scaling", None) is not None:
        self.fail("Expected tags not set on new instance")

    time.sleep(15)
    check_instance_volumes_tags()

    checked_type = expected_new_type if expected_new_type is not None else \
        self.get_new_instance_types(self.replaced_instance_id, same_type)[0]
    if self.new_instance["InstanceType"] != checked_type:
        self.fail("Type of instance {} is {} but expected type {}".format(self.new_instance_id,
                                                                          self.new_instance["InstanceType"],
                                                                          checked_type))
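
# Note on the "& 0xFF" masking above: per the EC2 API reference, only the low
# byte of State.Code carries the instance state; the high byte is for internal
# use and should be ignored. Documented low-byte values:
#   0 pending, 16 running, 32 shutting-down, 48 terminated, 64 stopping, 80 stopped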
def handle_request(self, use_custom_select=True):
    """
    Handles the cloudwatch rule timer event
    :return: Started tasks, if any, information
    """
    try:
        self._logger.info("Handling CloudWatch event {}", safe_json(self._event, indent=3))

        result = []
        start = datetime.now()

        dt = self._event_time()
        config_task = None

        source_resource_tags = None

        try:
            # for all tasks in the configuration that react to events
            for config_task in TaskConfiguration(context=self._context, logger=self._logger).get_tasks():

                self._logger.debug_enabled = config_task.get(handlers.TASK_DEBUG, False)

                if not self._event_triggers_task(task=config_task):
                    continue

                # tasks that can react to events with a wider resource scope than the actual resource causing
                # the event may have a filter that is used to filter based on the tags of the resource
                event_source_tag_filter = config_task.get(handlers.TASK_EVENT_SOURCE_TAG_FILTER, None)
                if event_source_tag_filter is not None:
                    if source_resource_tags is None:
                        # get the tags for the source resource of the event
                        session = services.get_session(self._role_executing_triggered_task,
                                                       logger=self._logger)
                        if session is None:
                            self._logger.error(ERR_NO_SESSION_FOR_GETTING_TAGS)
                            continue
                        try:
                            source_resource_tags = self._source_resource_tags(session, config_task)
                        except Exception as ex:
                            self._logger.error(ERR_GETTING_EVENT_SOURCE_RESOURCE_TAGS, ex)
                            continue

                        self._logger.debug("Tags for event source resource are {}", source_resource_tags)

                    # apply filter to source resource tags
                    if not TagFilterExpression(event_source_tag_filter).is_match(source_resource_tags):
                        self._logger.debug("Tags of source resource do not match tag filter {}",
                                           event_source_tag_filter)
                        continue

                task_name = config_task[handlers.TASK_NAME]
                result.append(task_name)

                select_parameters = self._select_parameters(self._event_name(), config_task)
                if select_parameters is None:
                    continue

                self._logger.debug(DEBUG_EVENT, task_name, self._event_name(), select_parameters,
                                   self._event_account(), self._event_region(),
                                   safe_json(config_task, indent=3))

                # create an event for the lambda function that scans for resources for this task
                lambda_event = {
                    handlers.HANDLER_EVENT_ACTION: handlers.HANDLER_ACTION_SELECT_RESOURCES,
                    handlers.HANDLER_EVENT_CUSTOM_SELECT: use_custom_select,
                    handlers.HANDLER_SELECT_ARGUMENTS: {
                        handlers.HANDLER_EVENT_REGIONS: [self._event_region()],
                        handlers.HANDLER_EVENT_ACCOUNT: self._event_account(),
                        handlers.HANDLER_EVENT_RESOURCE_NAME: config_task[handlers.TASK_RESOURCE_TYPE],
                    },
                    handlers.HANDLER_EVENT_SOURCE: "{}:{}:{}".format(self._handled_event_source,
                                                                     self._handled_detail_type,
                                                                     self._event_name()),
                    handlers.HANDLER_EVENT_TASK: config_task,
                    handlers.HANDLER_EVENT_TASK_DT: dt
                }

                for i in select_parameters:
                    lambda_event[handlers.HANDLER_SELECT_ARGUMENTS][i] = select_parameters[i]

                if self._event_resources() is not None:
                    self._logger.debug(DEBUG_EVENT_RESOURCES, safe_json(self._event_resources(), indent=3))
                    lambda_event[handlers.HANDLER_SELECT_RESOURCES] = self._event_resources()

                if not handlers.running_local(self._context):
                    # start lambda function to scan for task resources
                    payload = str.encode(safe_json(lambda_event))
                    client = get_client_with_retries("lambda", ["invoke"],
                                                     context=self._context,
                                                     logger=self._logger)
                    client.invoke_with_retries(FunctionName=self._context.function_name,
                                               InvocationType="Event",
                                               LogType="None",
                                               Payload=payload)
                else:
                    # or if not running in lambda environment pass event to main task handler
                    lambda_handler(lambda_event, None)

            return safe_dict({
                "datetime": datetime.now().isoformat(),
                "running-time": (datetime.now() - start).total_seconds(),
                "event-datetime": dt,
                "started-tasks": result
            })

        except ValueError as ex:
            self._logger.error(ERR_HANDLING_EVENT_IN_BASE_HANDLER, ex, safe_json(config_task, indent=2))

    finally:
        self._logger.flush()
def _execute_task(self, task, dt=None, task_group=None):
    """
    Executes a task by starting a lambda function that selects the resources for that action
    :param task: Task started
    :param dt: Task start datetime
    :return:
    """
    debug_state = self._logger.debug_enabled
    self._logger.debug_enabled = task.get(handlers.TASK_DEBUG, False)
    if task_group is None:
        task_group = str(uuid.uuid4())
    try:

        event = {
            handlers.HANDLER_EVENT_ACTION: handlers.HANDLER_ACTION_SELECT_RESOURCES,
            handlers.HANDLER_EVENT_TASK: task,
            handlers.HANDLER_EVENT_SOURCE: "scheduler-handler",
            handlers.HANDLER_EVENT_TASK_DT: dt.isoformat() if dt is not None
                else datetime.utcnow().isoformat(),
            handlers.HANDLER_EVENT_TASK_GROUP: task_group
        }

        sub_tasks = list(ScheduleHandler.task_account_region_sub_tasks(task))
        for sub_task in sub_tasks:
            event[handlers.HANDLER_EVENT_SUB_TASK] = sub_task

            if not handlers.running_local(self._context):
                if task[handlers.TASK_SELECT_SIZE] != actions.ACTION_USE_ECS:
                    # start lambda function to scan for task resources
                    payload = str.encode(safe_json(event))
                    client = get_client_with_retries("lambda", ["invoke"], context=self._context)

                    function_name = "{}-{}-{}".format(os.getenv(handlers.ENV_STACK_NAME),
                                                      os.getenv(handlers.ENV_LAMBDA_NAME),
                                                      task[handlers.TASK_SELECT_SIZE])
                    self._logger.info(INFO_RUNNING_LAMBDA, function_name)

                    try:
                        resp = client.invoke_with_retries(FunctionName=function_name,
                                                          InvocationType="Event",
                                                          LogType="None",
                                                          Payload=payload)
                        self._logger.debug(DEBUG_LAMBDA, resp["StatusCode"], payload)
                    except Exception as ex:
                        self._logger.error(ERR_FAILED_START_LAMBDA_TASK, str(ex))
                else:
                    ecs_args = {
                        handlers.HANDLER_EVENT_ACTION: handlers.HANDLER_ACTION_SELECT_RESOURCES,
                        handlers.TASK_NAME: task[handlers.TASK_NAME],
                        handlers.HANDLER_EVENT_SUB_TASK: sub_task
                    }

                    ecs_memory = task.get(handlers.TASK_SELECT_ECS_MEMORY, None)
                    self._logger.info(INFO_RUNNING_AS_ECS_JOB, task[handlers.TASK_NAME])
                    handlers.run_as_ecs_job(ecs_args,
                                            ecs_memory_size=ecs_memory,
                                            context=self._context,
                                            logger=self._logger)
            else:
                if task[handlers.TASK_SELECT_SIZE] == actions.ACTION_USE_ECS:
                    ecs_args = {
                        handlers.HANDLER_EVENT_ACTION: handlers.HANDLER_ACTION_SELECT_RESOURCES,
                        handlers.TASK_NAME: task[handlers.TASK_NAME],
                        handlers.HANDLER_EVENT_SUB_TASK: sub_task
                    }

                    ecs_memory = task.get(handlers.TASK_SELECT_ECS_MEMORY, None)
                    handlers.run_as_ecs_job(ecs_args, ecs_memory_size=ecs_memory, logger=self._logger)
                else:
                    # or if not running in lambda environment pass event to main task handler
                    lambda_handler(event, self._context)

        return task_group, sub_tasks
    finally:
        self._logger.debug_enabled = debug_state
def handle_scheduler_tasks(self, task_config):

    started_tasks = {}
    start = datetime.now()

    last_run_dt = self._get_last_run()

    self._logger.info(INFO_LAST_SAVED, last_run_dt.isoformat())

    if self.configuration_update:
        self._logger.info(INFO_CONFIG_RUN, self.updated_task)

    current_dt = self._set_last_run()
    already_ran_this_minute = last_run_dt == current_dt
    if already_ran_this_minute and not (self.configuration_update or self.execute_task_request):
        self._logger.info(INFO_TASK_SCHEDULER_ALREADY_RAN)
    else:

        self._logger.info(INFO_CURRENT_SCHEDULING_DT, current_dt)

        task = None
        enabled_tasks = 0

        next_executed_task = None
        utc = pytz.timezone("UTC")

        tasks = [t for t in task_config.get_tasks()
                 if t.get(handlers.TASK_INTERVAL) is not None and t.get(handlers.TASK_ENABLED, True)]

        try:
            for task in tasks:

                enabled_tasks += 1

                self._logger.debug_enabled = task[handlers.TASK_DEBUG]

                task_name = task[handlers.TASK_NAME]

                # timezone for specific task
                task_timezone = pytz.timezone(task[handlers.TASK_TIMEZONE])

                # create cron expression to test if the task needs to be executed
                task_cron_expression = CronExpression(expression=task[handlers.TASK_INTERVAL])

                localized_last_run = last_run_dt.astimezone(task_timezone)
                localized_current_dt = current_dt.astimezone(task_timezone)

                next_execution = task_cron_expression.first_within_next(timedelta(hours=24),
                                                                        localized_current_dt)
                next_execution_utc = next_execution.astimezone(utc).replace(microsecond=0) \
                    if next_execution is not None else None

                if next_execution_utc is not None:
                    if next_executed_task is None or next_execution_utc < next_executed_task[0]:
                        next_executed_task = (next_execution_utc, task)

                if already_ran_this_minute:
                    continue

                # test if the task needs to be executed since the last run of ops automator
                execute_dt_since_last = task_cron_expression.last_since(localized_last_run,
                                                                        localized_current_dt)

                if execute_dt_since_last is None:
                    if next_execution is not None:
                        next_execution = next_execution.astimezone(task_timezone)
                        self._logger.info(INFO_NEXT_EXECUTION, task_name, next_execution.isoformat(),
                                          task_timezone)
                    else:
                        self._logger.info(INFO_NO_NEXT_WITHIN, task_name)
                    continue

                self._logger.info(INFO_SCHEDULED_TASK, task_name, execute_dt_since_last, task_timezone,
                                  str(safe_json(task, indent=2)))

                # create an event for the lambda function that starts execution by selecting resources
                # for this task
                task_group, sub_tasks = self._execute_task(task, execute_dt_since_last)

                started_tasks[task_name] = {
                    "task-group": task_group,
                    "sub-tasks": sub_tasks
                }

            if started_tasks:
                self._logger.info(INFO_STARTED_TASKS, enabled_tasks, ",".join(started_tasks))
            else:
                self._logger.info(INFO_NO_TASKS_STARTED, enabled_tasks)

            self._set_next_schedule_event(current_dt, next_executed_task)

            running_time = float((datetime.now() - start).total_seconds())

            return safe_dict({
                "datetime": datetime.now().isoformat(),
                "running-time": running_time,
                "event-datetime": current_dt.isoformat(),
                "enabled_tasks": enabled_tasks,
                "started-tasks": started_tasks
            })

        except ValueError as ex:
            self._logger.error(ERR_SCHEDULE_HANDLER, ex, safe_json(task, indent=2))
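
# Illustrative sketch of the scheduling test above: a task is due when its cron
# expression has a trigger time in the window (last_run, now]. The third-party
# croniter package is used here purely for illustration; the solution ships its
# own CronExpression class with first_within_next/last_since.
from datetime import datetime

from croniter import croniter

def should_run(expression, last_run, now):
    # first trigger after the previous scheduler run
    next_trigger = croniter(expression, last_run).get_next(datetime)
    return next_trigger <= now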
def ecs_handler(args):
    dt = datetime.utcnow()
    log_stream = LOG_STREAM.format("OpsAutomatorMainEcs", dt.year, dt.month, dt.day)

    with outputs.queued_logger.QueuedLogger(logstream=log_stream, context=None, buffersize=20) as logger:

        action_step = args.get(handlers.HANDLER_EVENT_ACTION, None)
        if action_step is None:
            logger.error(ERR_ECS_NO_PARAM, handlers.HANDLER_EVENT_ACTION, safe_json(args, indent=3))
            return

        event = {}
        task_item = {}

        if action_step in [handlers.HANDLER_ACTION_EXECUTE, handlers.HANDLER_ACTION_TEST_COMPLETION]:

            task_id = args.get(handlers.TASK_TR_ID, None)
            if task_id is None:
                logger.error(ERR_ECS_NO_TASK_ID, handlers.TASK_TR_ID, safe_json(args, indent=3))
                return

            expected_status = handlers.STATUS_PENDING \
                if action_step == handlers.HANDLER_ACTION_EXECUTE \
                else handlers.STATUS_WAIT_FOR_COMPLETION

            task_item = handlers.task_tracking_table.TaskTrackingTable(
                logger=logger,
                context=EcsTaskContext(timeout_seconds=300)).get_task_item(task_id, status=expected_status)

            if task_item is None:
                logger.error(ECS_TASK_NOT_FOUND_FOR_STEP, task_id, expected_status, action_step)
                return

            event = {i: task_item.get(i) for i in task_item}
            event[handlers.HANDLER_EVENT_ACTION] = action_step

        elif action_step in [handlers.HANDLER_ACTION_SELECT_RESOURCES]:

            task_name = args.get(handlers.TASK_NAME, None)
            if task_name is None:
                logger.error(ERR_ECS_NO_TASK_NAME, configuration.CONFIG_TASK_NAME, safe_json(args, indent=3))
                return

            task_item = configuration.task_configuration.TaskConfiguration(
                logger=logger, context=None).get_task(task_name)

            if task_item is None:
                logger.error(ECS_TASK_DOES_NOT_EXIST, task_name)
                return

            event = {
                handlers.HANDLER_EVENT_ACTION: handlers.HANDLER_ACTION_SELECT_RESOURCES,
                handlers.HANDLER_EVENT_TASK: task_item,
                handlers.HANDLER_EVENT_SOURCE: "ecs_handler",
                handlers.HANDLER_EVENT_TASK_DT: datetime.now().isoformat()
            }

        return lambda_handler(event=event,
                              context=EcsTaskContext(
                                  timeout_seconds=task_item.get(handlers.TASK_TIMEOUT, 3600)))
def handle_request(self):
    """
    Handles the event
    :return: result of handling the event, result sent back to REST admin api
    """

    def snake_to_pascal_case(s):
        converted = ""
        s = s.strip("_").capitalize()
        i = 0
        while i < len(s):
            if s[i] == "-":
                pass
            elif s[i] == "_":
                i += 1
                converted += s[i].upper()
            else:
                converted += s[i]
            i += 1
        return converted

    def dict_to_pascal_case(d):
        ps = {}
        if isinstance(d, dict):
            for i in d:
                key = snake_to_pascal_case(i)
                ps[key] = dict_to_pascal_case(d[i])
            return ps
        elif isinstance(d, list):
            return [dict_to_pascal_case(l) for l in d]
        return d

    try:
        self._logger.info("Handler {} : Received CLI request \n{}", self.__class__.__name__,
                          safe_json(self._event, indent=3))

        # get access to admin api module
        admin_module = configuration.task_admin_api

        # get api action and map it to a function in the admin API
        fn_name = self.commands.get(self.action, None)
        if fn_name is None:
            raise ValueError("Command {} does not exist".format(self.action))
        fn = getattr(admin_module, fn_name)

        # calling the mapped admin api method
        self._logger.info("Calling \"{}\" with parameters \n{}", fn.__name__,
                          safe_json(self.parameters, indent=3))

        args = self.parameters
        args["context"] = self._context
        api_result = fn(**args)

        # convert to awscli PascalCase output format
        result = dict_to_pascal_case(api_result)

        # perform output transformation
        if fn_name in self.result_transformations:
            result = jmespath.search(self.result_transformations[fn_name], result)

        for t in self.attribute_transformations:
            if t in result:
                if self.attribute_transformations[t] is not None:
                    result[self.attribute_transformations[t]] = result[t]
                del result[t]

        # log formatted result
        json_result = safe_json(result, 3)
        self._logger.info("Call result is {}", json_result)

        return result

    except Exception as ex:
        self._logger.info("Call failed, error is {}", str(ex))
        return {"Error": str(ex)}
    finally:
        self._logger.flush()
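
# Usage sketch for the case conversion above:
#   snake_to_pascal_case("task_name")                       -> "TaskName"
#   dict_to_pascal_case({"task_name": {"tag_filter": "*"}}) -> {"TaskName": {"TagFilter": "*"}}
# Hyphens are dropped and the character following an underscore is upper-cased.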
def run(self, parameters, complete_check_polling_interval=60, task_timeout=None, task_name=None,
        datetime_delta=None, events=None, tag_filter=None, run_in_regions=None, action_select_params=None,
        debug=False, run_after_select=None):

    self.results = []
    self.executed_tasks = []
    self.parameters = parameters
    self.action_select_parameters = action_select_params if action_select_params is not None else {}
    self.task_name = task_name if task_name is not None else "{}-test".format(self.action_name).lower()
    self._ensure_action_stack()
    self.run_after_select = run_after_select
    self.context = Context()
    self._events = events if events is not None else {}
    self._tag_filter = tag_filter

    save_debug = self.debug
    self.debug = debug
    self.logger._debug = self.debug

    self.interval = parameters.get(actions.ACTION_PARAM_INTERVAL, None)

    self.run_in_regions = run_in_regions if run_in_regions is not None else [self.test_region]

    try:
        if datetime_delta is not None:
            set_datetime_delta(datetime_delta)
            actions.set_date_time_provider(DatetimeProvider)
            self.logger.test("Setting simulated test execution date and time to {}",
                             actions.date_time_provider().now())

        for executed_task in self._get_tasks_to_execute():
            try:
                self.executed_tasks.append(executed_task)

                self.logger.test("Start execution of action {} using assumed role {} in region {}",
                                 self.action_name, self._assumed_role, self.tested_region)
                start_result = executed_task.execute()

                if not executed_task.get(actions.ACTION_PARAM_HAS_COMPLETION, False):
                    self.logger.test("Action completed with result {}", safe_json(start_result, indent=3))
                    setattr(executed_task, handlers.TASK_TR_RESULT, start_result)
                    setattr(executed_task, handlers.TASK_TR_STATUS, handlers.STATUS_COMPLETED)
                else:
                    self.logger.test("Waiting for task to complete")
                    setattr(executed_task, handlers.TASK_TR_START_RESULT, start_result)

                    # noinspection PyProtectedMember
                    timeout = executed_task._timeout_
                    if timeout is None:
                        timeout = task_timeout * 60 if task_timeout is not None else 60
                    timeout *= 60

                    with Timer(timeout_seconds=timeout) as timer:
                        while True:
                            is_completed = getattr(executed_task, handlers.COMPLETION_METHOD, None)
                            if is_completed is None:
                                raise Exception(
                                    "Tested action needs completion but does not implement the "
                                    "required {} method".format(handlers.COMPLETION_METHOD))
                            complete_result = executed_task.is_completed(start_result)
                            if complete_result is not None:
                                self.logger.test("Action completed with result {}",
                                                 safe_json(complete_result, indent=3))
                                setattr(executed_task, handlers.TASK_TR_STATUS, handlers.STATUS_COMPLETED)
                                setattr(executed_task, handlers.TASK_TR_RESULT, complete_result)
                                break
                            if timer.timeout:
                                self.logger.test("Action timed out")
                                setattr(executed_task, handlers.TASK_TR_STATUS, handlers.STATUS_TIMED_OUT)
                                setattr(executed_task, handlers.TASK_TR_ERROR, "Timeout")
                                break
                            self.logger.test("Action not completed yet, waiting {} seconds",
                                             complete_check_polling_interval)
                            time.sleep(complete_check_polling_interval)

                self.results.append(executed_task)

            except Exception as ex:
                self.logger.test("Action failed {}", str(ex))
                setattr(executed_task, handlers.TASK_TR_STATUS, handlers.STATUS_FAILED)
                setattr(executed_task, handlers.TASK_TR_ERROR, str(ex))
                self.results.append(executed_task)
    finally:
        if datetime_delta is not None:
            actions.reset_date_provider()
        self.debug = save_debug
        self.logger._debug = self.debug

    return self.results