def grant_create_volume_permissions(snap_ids):
    # Shares each snapshot with the configured accounts by granting them
    # createVolumePermission; records the granted accounts in the action result.
    accounts = self.accounts_with_create_permissions
    if accounts is None or len(accounts) == 0:
        return
    permission_add = [{"UserId": account.strip()} for account in accounts]
    for snapshot_id in snap_ids:
        try:
            self.ec2_client.modify_snapshot_attribute_with_retries(
                CreateVolumePermission={"Add": permission_add},
                SnapshotId=snapshot_id)
            self._logger_.info(INFO_SETTING_CREATE_VOLUME_PERMISSIONS,
                               ", ".join(accounts))
            self.result["create-volume-access-accounts"] = [account.strip() for account in accounts]
        except Exception as ex:
            raise_exception(ERR_SETTING_CREATE_VOLUME_PERMISSIONS, accounts, ex)
def set_tags_on_resized_instance(new_instance_type, original_type):
    # Applies the action's resize tags to the instance, substituting the old
    # and new instance type into the tag template.
    resize_tags = self.build_tags_from_template(
        parameter_name=PARAM_RESIZED_INSTANCE_TAGS,
        tag_variables={
            TAG_PLACEHOLDER_NEW_INSTANCE_TYPE: new_instance_type,
            TAG_PLACEHOLDER_ORG_INSTANCE_TYPE: original_type
        })
    try:
        if task_is_triggered_by_tag_event():
            # tags that match the scale up/down tag filters would re-trigger
            # execution of the task, so strip them when resizing by step
            if self.resize_mode == RESIZE_BY_STEP:
                for key in list(resize_tags.keys()):
                    matches_up = self.scale_up_tagfilter and key in self.scale_up_tagfilter.get_filter_keys()
                    matches_down = self.scale_down_tagfilter and key in self.scale_down_tagfilter.get_filter_keys()
                    if matches_up or matches_down:
                        self._logger_.info(
                            INF_TAGS_NOT_SET_STEP.format({key: resize_tags[key]}, self.instance_id))
                        del resize_tags[key]
            resize_tags.update(tags_to_delete())
        self.set_ec2_instance_tags_with_event_loop_check(
            client=self.ec2_client,
            instance_ids=[self.instance_id],
            tags_to_set=resize_tags)
    except Exception as tag_ex:
        raise_exception(ERR_SET_TAGS, self.instance_id, tag_ex)
def _update_request(self):
    """
    Handles a CloudFormation update request for a Task resource.
    A change of the task name is handled by creating a new task (and new
    physical resource id); otherwise the existing task is updated in place.
    :return: True if the update succeeded, False otherwise (the failure
    reason is stored in self.response["Reason"])
    """
    self._logger.info("Updating Task resource")
    name = self.resource_properties.get(CONFIG_TASK_NAME)
    try:
        if name is None:
            raise_exception(ERR_NO_TASK_NAME_RESOURCE_PROPERTY)

        if name != self.physical_resource_id:
            # bug fix: the log arguments were swapped; the first placeholder is
            # the current physical resource id, the second is the new name
            self._logger.info(
                "Name change for resource with physical resource id {}, new value is {}",
                self.physical_resource_id, name)
            self.arguments[CONFIG_TASK_NAME] = name
            create_task(**self.arguments)
            self.physical_resource_id = name
            self._logger.info(
                "Created new resource with physical resource id {}",
                self.physical_resource_id)
        else:
            update_task(name, **self.arguments)
            self._logger.info(
                "Updated resource with physical resource id {}",
                self.physical_resource_id)

        return True
    except Exception as ex:
        self.response["Reason"] = str(ex)
        self._logger.error(ERR_UPDATING_TASK, name, ex)
        return False
def __init__(self, action_arguments, action_parameters):
    """Initializes the action; requires the configuration table name in the environment."""
    ActionBase.__init__(self, action_arguments, action_parameters)
    self.config_table = os.environ.get(configuration.ENV_CONFIG_TABLE)
    self.debug = self.get(actions.ACTION_PARAM_DEBUG, False)
    # fail fast if the stack did not provide the configuration table variable
    if self.config_table is None:
        raise_exception(ERR_ENVIRONMENT_CONFIG_VARIABLE_, configuration.ENV_CONFIG_TABLE)
    self.S3Bucket = self.get(PARAM_S3_BUCKET)
    self.S3Prefix = self.get(PARAM_S3_PREFIX)
def __init__(self, action_arguments, action_parameters):
    """Initializes the action; requires the action tracking table name in the environment."""
    ActionBase.__init__(self, action_arguments, action_parameters)
    self.task_table = os.environ.get(handlers.ENV_ACTION_TRACKING_TABLE)
    # the tracking table name must be provided by the deployment
    if self.task_table is None:
        raise_exception(ERR_NO_ENVIRONMENT_VARIABLE_, handlers.ENV_ACTION_TRACKING_TABLE)
    self.dryrun = self.get(actions.ACTION_PARAM_DRYRUN, False)
    self.debug = self.get(actions.ACTION_PARAM_DEBUG, False)
    self.S3Bucket = self.get(PARAM_S3_BUCKET)
    self.S3Prefix = self.get(PARAM_S3_PREFIX, "")
def _get_cloudwatch_rule(name, client):
    """
    Get the CloudWatch event rule with the name prefix that is the stack name + name in the current stack
    :param name: part of the name that is added to the stack name to build the name prefix of the cloudwatch rule
    :param client: CloudWatch client
    :return: CloudWatch rules
    """
    rules = client.list_rules_with_retries(NamePrefix=name).get("Rules", [])
    # exactly one rule must match the prefix, otherwise the rule cannot be identified
    if len(rules) == 1:
        return rules[0]
    raise_exception(ERR_EVENT_RULE_NOT_FOUND, name, getenv(ENV_STACK_NAME))
def get(self, service_resource, region=None, tags_as_dict=None, tags=False, as_tuple=None, select_on_tag=None, select=None,
        tag_roles=None, **describe_args):
    """
    Alternative for the describe method in cases where only a single specific resource is expected.
    An exception is raised when multiple resources are returned from the service
    :param select_on_tag: Get only if resource has this tag
    :param service_resource: Name of the service resource, not case sensitive, use camel or snake case
    :param region: Region from where resources are retrieved, if None then the current region is used
    :param tags: Set to True to return tags with the resource
    :param tags_as_dict: Set to True to return tags as python dictionaries
    :param as_tuple: Set to true to return results as immutable named dictionaries instead of dictionaries
    :param select: JMES path to select resources and select/transform attributes of returned resources
    :param tag_roles: optional role used to assume to select tags for a resource as this may be required by
    shared resources from another account
    :param describe_args: Parameters passed to the boto "describe" function
    :return: Service resource of the specified resource type for the service, None if the resource was not available.
    """
    # get resources
    results = self.describe(service_resource=service_resource,
                            region=region,
                            tags=tags,
                            tags_as_dict=tags_as_dict,
                            as_tuple=as_tuple,
                            select=select,
                            select_on_tag=select_on_tag,
                            tag_roles=tag_roles,
                            **describe_args)
    try:
        # get the first returned resource
        # bug fix: use the builtin next() instead of the Python 2 only
        # generator method .next(), which does not exist in Python 3
        result = next(results)
        try:
            # if there is more than one result, raise an exception
            next(results)
            raise_exception(ERR_UNEXPECTED_MULTIPLE_RESULTS)
        except StopIteration:
            # expected exception as there should be only one result
            return result
    except StopIteration:
        # no matching resource found
        return None
def __init__(self, action_arguments, action_parameters):
    """Initializes the cleanup action; reads retention settings and the tracking table name."""
    ActionBase.__init__(self, action_arguments, action_parameters)
    self.task_table = os.environ.get(handlers.ENV_ACTION_TRACKING_TABLE)
    if self.task_table is None:
        raise_exception(ERR_MISSING_ENVIRONMENT_VARIABLE_, handlers.ENV_ACTION_TRACKING_TABLE)
    # 48 hours are added because TTL is used in V2 as the primary mechanism to delete items
    retention_hours = int(self.get(PARAM_TASK_RETENTION_HOURS)) + 48
    self.task_retention_seconds = retention_hours * 3600
    self.retain_failed_tasks = self.get(PARAM_RETAIN_FAILED_TASKS, True)
    self.dryrun = self.get(ACTION_PARAM_DRYRUN, False)
    self.debug = self.get(ACTION_PARAM_DEBUG, False)
    self._client = None
def _stop_instance(self):
    """Stops the instance and waits until it reaches the stopped state or the task times out."""
    try:
        self._logger_.info(INF_STOPPING, self.instance_id)
        self.ec2_client.stop_instances_with_retries(InstanceIds=[self.instance_id])
    except Exception as ex:
        raise_exception(ERR_STOP_RESIZING, self.instance_id, ex)

    # poll every 10 seconds until the instance reports stopped, or the task
    # signals it is about to time out
    while not self.time_out():
        time.sleep(10)
        if (self._get_instance()["StateCode"] & 0xFF) == EC2_STATE_STOPPED:
            break

    if self.time_out():
        raise_exception(ERR_INSTANCE_STOP_TIMEOUT, self.instance_id)
def tag_shared_source_snapshot(copy_id):
    # Sets tags on the source snapshot in the account that shared it; only
    # applies when the snapshot was shared by another account.
    tags_for_source = source_tags(copy_id, PARAM_SOURCE_SHARED_BY_TAGS)
    if not self.tag_snapshots_in_source_account or len(tags_for_source) == 0:
        return

    # only for snapshots that have been shared by another account
    if self.owner == self.get_account_for_task():
        self._logger_.debug(
            "Account {} is owner, no tags set for snapshot {} in account of owner",
            self._account_, self.source_snapshot_id)
        return

    tagging_session = self.get_action_session(
        account=self.owner,
        param_name=PARAM_SOURCE_ACCOUNT_TAG_ROLE_NAME,
        logger=self._logger_)
    if tagging_session is None:
        self._logger_.error(ERR_TAGS_NOT_SET_IN_ACCOUNT, self.owner)
        return

    try:
        self._logger_.info(INF_CREATE_SHARED_ACCOUNT_SNAPSHOT_TAGS,
                           tags_for_source, self.source_snapshot_id, self.owner)
        owner_ec2_client = get_client_with_retries(
            service_name="ec2",
            methods=["create_tags", "delete_tags"],
            context=self._context_,
            region=self.source_region,
            session=tagging_session,
            logger=self._logger_)
        tagging.set_ec2_tags(ec2_client=owner_ec2_client,
                             resource_ids=[self.source_snapshot_id],
                             tags=tags_for_source,
                             logger=self._logger_)
    except Exception as ex:
        raise_exception(ERR_SETTING_SOURCE_SHARED_TAGS, self.owner, str(ex))
def _restart_instance(self):
    # Starts the (resized) instance again and checks that it enters a
    # starting or running state.
    # for testing the parameter PARAM_TEST_UNAVAILABLE_TYPES can be used to
    # simulate an InsufficientInstanceCapacity
    self._test_simulate_insufficient_instance_capacity()

    self.ec2_client.start_instances_with_retries(InstanceIds=[self.instance_id])

    with Timer(timeout_seconds=60, start=True) as t:
        started_instance = self._get_instance()
        # get state of started instance
        current_state = started_instance["StateCode"]
        if self.is_in_starting_or_running_state(current_state):
            # instance is starting
            return
        else:
            # NOTE(review): the 60s Timer is only inspected once here, so
            # t.timeout can never be True on this single check — a polling
            # loop may have been intended; confirm against original source
            if t.timeout:
                self._logger_.info(ERR_INSTANCE_NOT_IN_STARTING_STATE,
                                   self.instance_id, current_state)
                raise_exception(ERR_INSTANCE_NOT_IN_STARTING_STATE,
                                self.instance_id, current_state)
def action_resources(self):
    # Lazily loads the resource data for this action: inline from the task
    # item, or from the S3 resource bucket when it was stored there.
    if self._action_resources is not None:
        return self._action_resources

    if not self._event.get(handlers.TASK_TR_S3_RESOURCES, False):
        # resources are stored inline in the task item
        self._action_resources = handlers.get_item_resource_data(self._event, self._context)
        return self._action_resources

    # resources were too large for the item and were written to S3
    bucket = os.getenv(handlers.ENV_RESOURCE_BUCKET)
    key = "{}.json".format(self.action_id)
    try:
        resp = self.s3_client.get_object_with_retries(Bucket=bucket, Key=key)
        self._event[handlers.TASK_TR_RESOURCES] = resp["Body"].read().decode('utf-8')
        self._action_resources = handlers.get_item_resource_data(self._event, self._context)
    except Exception as ex:
        raise_exception(ERR_READING_S3_RESOURCES, bucket, key, self.action_id, ex)
    return self._action_resources
def tag_shared_snapshots(tags, snap_id):
    # Creates tags on the shared snapshot in every account it was shared with.
    if not self.tag_snapshots_in_shared_accounts or len(tags) == 0:
        return
    if self.accounts_with_create_permissions in ["", None]:
        return

    for account in self.accounts_with_create_permissions:
        account_session = self.get_action_session(
            account=account,
            param_name=PARAM_DESTINATION_ACCOUNT_TAG_ROLENAME,
            logger=self._logger_)
        if account_session is None:
            # cannot assume a tagging role in this account, skip it
            self._logger_.error(ERR_TAGS_NOT_SET_IN_ACCOUNT, account)
            continue
        try:
            shared_ec2_client = get_client_with_retries(
                service_name="ec2",
                methods=["create_tags", "delete_tags"],
                context=self._context_,
                region=self.get(PARAM_DESTINATION_REGION),
                session=account_session,
                logger=self._logger_)
            tagging.set_ec2_tags(ec2_client=shared_ec2_client,
                                 resource_ids=[snap_id],
                                 tags=tags,
                                 logger=self._logger_)
            self._logger_.info(INF_CREATE_SHARED_TAGS, tags, account)
        except Exception as ex:
            raise_exception(ERR_SETTING_SHARED_TAGS, account, str(ex))
def grant_create_volume_permissions(snap_id):
    # Grants createVolumePermission on the snapshot copy to the configured accounts.
    accounts = self.accounts_with_create_permissions
    if accounts is None or len(accounts) == 0:
        return
    try:
        self.ec2_destination_client.modify_snapshot_attribute_with_retries(
            CreateVolumePermission={"Add": [{"UserId": account.strip()} for account in accounts]},
            SnapshotId=snap_id)
        self._logger_.info(INF_SETTING_CREATE_VOLUME_PERMISSIONS, ", ".join(accounts))
    except Exception as ex:
        raise_exception(ERR_SETTING_CREATE_VOLUME_PERMISSIONS, accounts, ex)
def _send_response(self):
    """
    Send the response to the cloudformation provided url
    :return: set with formatted status code and status message of the PUT request
    """
    # Build the PUT request body once; it is used for both the payload and
    # the content-length header (previously the body was serialized twice)
    resp = json.dumps(self.response)
    headers = {'content-type': '', 'content-length': str(len(resp))}

    # PUT request to cloudformation provided S3 url
    try:
        response = requests.put(self.response_url,
                                data=resp,
                                headers=headers)
        response.raise_for_status()
        # NOTE(review): this returns a *set* of "key: value" strings; a dict
        # {"status_code": ..., "status_message": ...} may have been intended —
        # confirm with callers before changing the shape
        return {
            "status_code: {}".format(response.status_code),
            "status_message: {}".format(response.text)
        }
    except Exception as exc:
        raise_exception(ERR_SEND_RESP, self.stack_id, str(exc), self.response_url, resp)
def execute(self):
    """
    Resizes the EC2 instance to the new instance type: stops a running
    instance first, resizes it, then restarts it with fallback handling for
    insufficient capacity of the new type. Returns self.result.
    """
    def should_resize_instance():
        # False (with metrics recorded) when the instance already has the requested type
        if self.original_type == self.new_instance_type:
            self._logger_.info(INF_INSTANCE_NOT_RESIZED, self.instance_id, self.original_type)
            self.result["not-resized"] = True
            self.result[METRICS_DATA] = build_action_metrics(
                action=self, ReplacedInstances=0)
            return False
        return True

    self._logger_.info("{}, version {}", self.properties[ACTION_TITLE],
                       self.properties[ACTION_VERSION])

    # get instance in it's current state
    instance = self._get_instance()
    if instance is None:
        raise_exception(ERR_NOT_LONGER_AVAILABLE, self.instance_id)

    instance_running = not self.is_in_stopping_or_stopped_state(instance["StateCode"])
    self.result["instance-running"] = instance_running

    self.original_type = instance["InstanceType"]
    self.result["org-instance-type"] = self.original_type

    self._set_new_instance_type()
    if not should_resize_instance():
        self.result["new-instance-type"] = self.new_instance_type
        return self.result

    self._logger_.info(INF_INSTANCE_RESIZE_ACTION, self.instance_id,
                       self.original_type, self.new_instance_type, self._task_)

    # instance is running, stop it first so it can be resized
    if instance_running:
        self._stop_instance()

    self._resize_instance()

    if instance_running:
        while True:
            try:
                self._restart_instance()
                break
            except ClientError as ex:
                # no capacity for this type
                if self.insufficient_capacity(ex):
                    # try to set alternative type
                    self._logger_.warning(WARN_NO_TYPE_CAPACITY, self.new_instance_type)
                    self._set_new_instance_type()
                    if not should_resize_instance():
                        # no alternative left, resize back to original type and restart
                        self._resize_instance()
                        self._restart_instance()
                        self.result["new-instance-type"] = self.new_instance_type
                        return self.result
                    self._resize_instance()
                    self._logger_.info(INF_RETRY_START, self.instance_id, self.new_instance_type)
                # NOTE(review): a ClientError that is NOT an insufficient-capacity
                # error falls through here and the start is retried indefinitely —
                # confirm this is intended
            except Exception as ex:
                # any other start failure: restore the original type, restart and fail
                self.new_instance_type = self.original_type
                self._resize_instance()
                self._restart_instance()
                raise_exception(ERR_STARTING, self.instance_id, str(ex))

    self.result[METRICS_DATA] = build_action_metrics(
        action=self,
        ResizedInstances=1,
        OrgInstanceSize=self.original_type,
        NewInstanceSize=self.new_instance_type)

    self.result["new-instance-type"] = self.new_instance_type
    return self.result
def run_as_ecs_job(args, ecs_memory_size, context=None, logger=None):
    """
    Runs a task step as an ecs task
    :param args: ecs task parameters
    :param ecs_memory_size: reserved memory size for ecs task container
    :param context: lambda context
    :param logger: logger
    :return: result of ecs task submission
    """
    start_time = time.time()
    start_task_timeout = 300

    def timed_out_no_context(next_wait):
        # fixed timeout used when not running inside Lambda
        return (time.time() - start_time) > (start_task_timeout - next_wait)

    def timed_out_by_lambda_timeout(next_wait):
        if context is None:
            return False
        # bug fix: get_remaining_time_in_millis returns milliseconds, so the
        # remaining time must be divided (not multiplied) by 1000 to get seconds
        context_seconds_left = context.get_remaining_time_in_millis() / 1000
        return context_seconds_left < (5 + next_wait)

    runner_args = copy.deepcopy(args)
    ecs_client = boto_retry.get_client_with_retries("ecs", ["run_task"], context=context)

    stack_name = os.getenv(ENV_STACK_NAME)
    runner_args["stack"] = stack_name
    runner_args["stack_region"] = ecs_client.meta.region_name

    ecs_params = {
        "cluster": os.getenv(ENV_ECS_CLUSTER),
        "taskDefinition": os.getenv(ENV_ECS_TASK),
        "startedBy": "{}:{}".format(stack_name, args[TASK_NAME])[0:35],
        "overrides": {
            "containerOverrides": [{
                "name": "ops-automator",
                "command": [
                    "python", "ops-automator-ecs-runner.py",
                    safe_json(runner_args)
                ],
                "memoryReservation": int(ecs_memory_size if ecs_memory_size is not None else ECS_DEFAULT_MEMORY_RESERVATION)
            }],
        },
    }

    for wait_until_next_retry in boto_retry.LinearWaitStrategy(
            start=5, incr=5, max_wait=30, random_factor=0.50):
        # try to start task
        resp = ecs_client.run_task_with_retries(**ecs_params)

        # test if task was started
        if len(resp["tasks"]) != 0:
            if logger is not None:
                logger.info("{} executed as ECS job:{}\n",
                            args[HANDLER_EVENT_ACTION],
                            safe_json(resp.get("tasks", []), indent=3))
            return resp

        # investigate failures, note that no exceptions are raised if the task
        # fails because of insufficient resources in the cluster
        failures = resp.get("failures", [])

        # test for failures other than not enough memory resources on cluster
        # instances, and test if there is time left for another retry based on
        # the Lambda timeout, or a fixed timeout when not running in Lambda
        if not all([f["reason"] == "RESOURCE:MEMORY" for f in resp["failures"]]) or \
                (timed_out_by_lambda_timeout(next_wait=wait_until_next_retry) or
                 timed_out_no_context(next_wait=wait_until_next_retry)):
            raise_exception(ERR_FAILED_TO_START_ECS_TASK, safe_json(args),
                            safe_json(failures, indent=3))
        else:
            # sleep until next retry
            time.sleep(wait_until_next_retry)
def add_task_action(self, task, assumed_role, action_resources, task_datetime, source, task_group=None):
    # Builds a new tracking item for the task action, stores its resource data
    # (inline, optionally KMS-encrypted, or in S3 when too large) and queues
    # the item for writing to the tracking table. Returns the item.
    item = {
        handlers.TASK_TR_ID: str(uuid.uuid4()),
        handlers.TASK_TR_NAME: task[handlers.TASK_NAME],
        handlers.TASK_TR_ACTION: task[handlers.TASK_ACTION],
        handlers.TASK_TR_CREATED: datetime.now().isoformat(),
        handlers.TASK_TR_CREATED_TS: int(time()),
        handlers.TASK_TR_SOURCE: source,
        handlers.TASK_TR_DT: task_datetime,
        handlers.TASK_TR_STATUS: handlers.STATUS_PENDING,
        handlers.TASK_TR_DEBUG: task[handlers.TASK_DEBUG],
        handlers.TASK_TR_NOTIFICATIONS: task[handlers.TASK_NOTIFICATIONS],
        handlers.TASK_TR_METRICS: task[handlers.TASK_METRICS],
        handlers.TASK_TR_DRYRUN: task[handlers.TASK_DRYRUN],
        handlers.TASK_TR_INTERNAL: task[handlers.TASK_INTERNAL],
        handlers.TASK_TR_INTERVAL: task[handlers.TASK_INTERVAL],
        handlers.TASK_TR_TIMEZONE: task[handlers.TASK_TIMEZONE],
        handlers.TASK_TR_TIMEOUT: task[handlers.TASK_TIMEOUT],
        handlers.TASK_TR_STARTED_TS: int(time()),
        handlers.TASK_TR_EXECUTE_SIZE: task[handlers.TASK_EXECUTE_SIZE],
        handlers.TASK_TR_SELECT_SIZE: task[handlers.TASK_SELECT_SIZE],
        handlers.TASK_TR_EVENTS: task.get(handlers.TASK_EVENTS, {}),
        handlers.TASK_TR_COMPLETION_SIZE: task[handlers.TASK_COMPLETION_SIZE],
        handlers.TASK_TR_TAGFILTER: task[handlers.TASK_TAG_FILTER],
        handlers.TASK_TR_GROUP: task_group,
        handlers.TASK_TR_SERVICE: task[handlers.TASK_SERVICE],
        handlers.TASK_TR_RESOURCE_TYPE: task[handlers.TASK_RESOURCE_TYPE]
    }

    item[handlers.TASK_TR_RUN_LOCAL] = self._run_local

    # account is taken from the assumed role when present, otherwise the own account
    if assumed_role is not None:
        item[handlers.TASK_TR_ASSUMED_ROLE] = assumed_role
        item[handlers.TASK_TR_ACCOUNT] = services.account_from_role_arn(assumed_role)
    else:
        item[handlers.TASK_TR_ACCOUNT] = self.account

    if len(task[handlers.TASK_PARAMETERS]) > 0:
        item[handlers.TASK_TR_PARAMETERS] = task[handlers.TASK_PARAMETERS]

    parameters = item.get(handlers.TASK_TR_PARAMETERS, None)
    if parameters is not None:
        item[handlers.TASK_TR_PARAMETERS] = parameters

    # check if the class has a field or static method that returns true if the
    # action class needs completion; this way we can make completion dependent
    # on parameter values
    has_completion = getattr(actions.get_action_class(task[handlers.TASK_ACTION]),
                             actions.ACTION_PARAM_HAS_COMPLETION, None)
    if has_completion is not None:
        # if it is a static method call it passing the task parameters
        if isinstance(has_completion, types.FunctionType):
            has_completion = has_completion(parameters)
    else:
        # if it does not have this attribute, test if the class has a completion method
        has_completion = getattr(actions.get_action_class(task[handlers.TASK_ACTION]),
                                 handlers.COMPLETION_METHOD, None) is not None

    item[handlers.TASK_TR_HAS_COMPLETION] = has_completion

    resource_data_str = safe_json(action_resources)

    # encrypt the resource data when an encryption key is configured
    encrypted = self._resource_encryption_key not in [None, ""]
    item[handlers.TASK_TR_ENCRYPTED_RESOURCES] = encrypted
    if encrypted:
        resource_data_str = base64.b64encode(self.kms_client.encrypt_with_retries(
            KeyId=self._resource_encryption_key, Plaintext=resource_data_str)["CiphertextBlob"])

    # small resource data is stored inline, larger data goes to S3
    if len(resource_data_str) < int(os.getenv(handlers.ENV_RESOURCE_TO_S3_SIZE, 16)) * 1024:
        if encrypted:
            # NOTE(review): "not encrypted" is always False inside this branch,
            # so this conditional always stores resource_data_str — the
            # conditional expression is redundant
            item[handlers.TASK_TR_RESOURCES] = action_resources if not encrypted else resource_data_str
        else:
            item[handlers.TASK_TR_RESOURCES] = as_dynamo_safe_types(action_resources)
    else:
        bucket = os.getenv(handlers.ENV_RESOURCE_BUCKET)
        key = "{}.json".format(item[handlers.TASK_TR_ID])
        try:
            self.s3_client.put_object_with_retries(Body=resource_data_str, Bucket=bucket, Key=key)
        except Exception as ex:
            raise_exception(ERR_WRITING_RESOURCES, bucket, key, item[handlers.TASK_TR_ID], ex)
        item[handlers.TASK_TR_S3_RESOURCES] = True

    self._new_action_items.append(item)

    return item
def _get_tags_for_resource(self, client, resource):
    """
    Returns the tags for specific resources that require additional boto calls to retrieve their tags.
    :param client: Client that can be used to make the boto call to retrieve the tags
    :param resource: The resource for which to retrieve the tags
    :return: Tags
    """
    # get the name of the property that holds the arn of the resource
    arn_property_name = "{}Arn".format(self._resource_name[0:-1])
    if arn_property_name[0:2].lower() == "db":
        arn_property_name = "DB{}".format(arn_property_name[2:])

    # get the arn of the resource
    resource_arn = resource[arn_property_name]
    # owner of the resource (could be another account for shared snapshots)
    resource_owner_account = resource_arn.split(":")[4]
    resource_region = resource_arn.split(":")[3]

    if resource_owner_account == self.aws_account:
        # same account, can use same session as used to retrieve the resource
        if self._use_cached_tags:
            self._tag_session = self.session
        # make sure the client has retries
        if getattr(self._service_client, "list_tags_for_resource" + boto_retry.DEFAULT_SUFFIX, None) is None:
            boto_retry.make_method_with_retries(boto_client_or_resource=client,
                                                name="list_tags_for_resource",
                                                service_retry_strategy=self._service_retry_strategy)
        self._tag_rds_client = client
    else:
        # resource is from other account, get a session to get the tags from that account as these are not
        # visible for shared rds resources
        if self._tag_account != resource_owner_account or self._tag_session is None:
            self._tag_account = resource_owner_account
            used_tag_role = None
            if self._tag_roles is not None:
                # see if there is a role for the owner account
                for role in self._tag_roles:
                    if role is not None and services.account_from_role_arn(role) == resource_owner_account:
                        used_tag_role = role
                        break
                else:
                    # if there is no role and the account is the ops automator account use the default role,
                    # in other cases it is not possible to retrieve the tags
                    if resource_owner_account != os.getenv(handlers.ENV_OPS_AUTOMATOR_ACCOUNT):
                        return {}
            self._tag_session = services.get_session(role_arn=used_tag_role)
            if not self._use_cached_tags:
                self._tag_rds_client = boto_retry.get_client_with_retries("rds",
                                                                          methods=["list_tags_for_resource"],
                                                                          context=self._context,
                                                                          region=resource_region)

    if self._use_cached_tags:
        return self.cached_tags(session=self._tag_session,
                                resource_name=RESOURCES_WITH_TAGS[resource["ResourceTypeName"]],
                                region=resource_region).get(resource_arn, {})

    try:
        resp = self._tag_rds_client.list_tags_for_resource_with_retries(ResourceName=resource_arn)
        return resp.get("TagList", [])
    except botocore.exceptions.ClientError as ex:
        # InvalidParameterValue is treated as "resource has no retrievable tags"
        if getattr(ex, "response", {}).get("Error", {}).get("Code", "") == "InvalidParameterValue":
            return []
        raise_exception("Can not list rds tags for resource {}, {}", resource_arn, ex)
def is_completed(self, start_data):
    """
    Completion check for the resize action: waits until the resized instance is
    running again (when it was running before the resize) and sets or removes
    the related tags. Returns self.result when done, None to check again later.
    """
    def task_is_triggered_by_tag_event():
        # True when this task execution was triggered by an EC2 instance tag change event
        task_change_events = self._events_.get(
            handlers.ec2_tag_event_handler.EC2_TAG_EVENT_SOURCE, {}).get(
            handlers.TAG_CHANGE_EVENT, [])
        return handlers.ec2_tag_event_handler.EC2_CHANGED_INSTANCE_TAGS_EVENT in task_change_events

    def tags_to_delete():
        # instance tags matching the scale up/down tag filters, marked for deletion
        tags = {}
        tags_on_instance = self.instance.get("Tags", {})
        for t in list(tags_on_instance.keys()):
            if (self.scale_up_tagfilter and t in self.scale_up_tagfilter.get_filter_keys()) or \
                    (self.scale_down_tagfilter and t in self.scale_down_tagfilter.get_filter_keys()):
                self._logger_.info(INF_REMOVE_TAG.format({t: tags_on_instance[t]}, self.instance_id))
                tags[t] = tagging.TAG_DELETE
        return tags

    def delete_up_down_filter_tags():
        tags = tags_to_delete()
        if len(tags) > 0:
            tagging.set_ec2_tags(ec2_client=self.ec2_client,
                                 tags=tags,
                                 can_delete=True,
                                 logger=self._logger_,
                                 resource_ids=[self.instance_id])

    def set_tags_on_resized_instance(new_instance_type, original_type):
        # tags set by action
        tags = self.build_tags_from_template(
            parameter_name=PARAM_RESIZED_INSTANCE_TAGS,
            tag_variables={
                TAG_PLACEHOLDER_NEW_INSTANCE_TYPE: new_instance_type,
                TAG_PLACEHOLDER_ORG_INSTANCE_TYPE: original_type
            })
        try:
            # if task is triggered by tagging event
            if task_is_triggered_by_tag_event():
                # up or down tags filters should not match new tags as it would re-trigger execution of the task
                if self.resize_mode == RESIZE_BY_STEP:
                    for t in list(tags.keys()):
                        # remove tags that match up or down tag filters
                        if (self.scale_up_tagfilter and t in self.scale_up_tagfilter.get_filter_keys()) or \
                                (self.scale_down_tagfilter and t in self.scale_down_tagfilter.get_filter_keys()):
                            self._logger_.info(
                                INF_TAGS_NOT_SET_STEP.format({t: tags[t]}, self.instance_id))
                            del tags[t]
                tags.update(tags_to_delete())
            self.set_ec2_instance_tags_with_event_loop_check(
                client=self.ec2_client,
                instance_ids=[self.instance_id],
                tags_to_set=tags)
        except Exception as tag_ex:
            raise_exception(ERR_SET_TAGS, self.instance_id, tag_ex)

    resized = not start_data.get("not-resized", False)
    need_start = start_data.get("instance-running", True)

    # instance was stopped and not resized: only remove the trigger tags
    if not resized and not need_start:
        delete_up_down_filter_tags()
        self._logger_.info(INF_STOPPED_INSTANCE, self.instance_id)
        return self.result

    # instance was resized but does not need to be started again
    if not need_start and resized:
        set_tags_on_resized_instance(start_data["new-instance-type"],
                                     start_data.get("org-instance-type", ""))
        return self.result

    # get current state of instance
    instance = self._get_instance()
    self._logger_.debug("Instance data is {}", safe_json(instance, indent=3))
    state_code = instance["StateCode"] & 0xFF

    # resized instance is running, done...
    if state_code == EC2_STATE_RUNNING:
        # instance is running
        self._logger_.info(INF_INSTANCE_RUNNING, self.instance_id)
        if resized:
            set_tags_on_resized_instance(instance["InstanceType"],
                                         start_data.get("org-instance-type", ""))
        else:
            delete_up_down_filter_tags()
        return self.result

    # in pending state, wait for next completion check
    if state_code == EC2_STATE_PENDING:
        return None

    raise_exception(ERR_INSTANCE_NOT_IN_STARTING_STATE, self.instance_id, instance)
def flush(self, timeout_event=None):
    """
    Writes all cached action items in batches to the dynamodb table
    :param timeout_event: optional event; when set, writing stops early
    :return:
    """
    items_to_write = []
    has_failed_items_to_retry = False

    tasks_data = {}

    # create items to write to table and count submitted instances per task
    for item in self._new_action_items:
        task_name = item[handlers.TASK_TR_NAME]
        if task_name in tasks_data:
            tasks_data[task_name]["count"] += 1
        else:
            tasks_data[task_name] = {"count": 1, "task_level_metrics": item[handlers.TASK_TR_METRICS]}
        items_to_write.append(
            {
                "PutRequest": {
                    "Item": build_record(item)
                }
            })

    if len(tasks_data) > 0:
        with TaskMetrics(dt=datetime.utcnow(), logger=self._logger, context=self._context) as task_metrics:
            for name in tasks_data:
                # number of submitted task instances for task
                task_metrics.put_task_state_metrics(task_name=name,
                                                    metric_state_name=metrics.METRICS_STATUS_NAMES[handlers.STATUS_PENDING],
                                                    count=tasks_data[name]["count"],
                                                    task_level=tasks_data[name]["task_level_metrics"])

    if timeout_event is not None and timeout_event.is_set():
        return

    # buffer to hold a max of 25 items to write in a batch
    batch_write_items = []

    # write until all items are written
    while len(items_to_write) > 0 and (not (timeout_event.is_set() if timeout_event is not None else False)):
        try:
            batch_write_items.append(items_to_write.pop(0))
            if len(batch_write_items) == 25 or len(items_to_write) == 0:
                putrequest = {self._action_table.name: batch_write_items}
                resp = self._dynamodb_client.batch_write_item_with_retries(RequestItems=putrequest)

                # unprocessed items are put back in the list of items to write
                unprocessed_items = resp.get("UnprocessedItems", [])
                has_failed_items_to_retry = has_failed_items_to_retry or len(unprocessed_items) > 0
                for unprocessed_item in unprocessed_items:
                    has_failed_items_to_retry = True
                    items_to_write += unprocessed_items[unprocessed_item]
                batch_write_items = []
                sleep(1)
        except Exception as ex:
            # when there are items that are retried
            # NOTE(review): exceptions with no pending retried items are
            # silently swallowed here and the loop continues — confirm this
            # best-effort behavior is intended
            if has_failed_items_to_retry:
                raise_exception(ERR_ITEMS_NOT_WRITTEN,
                                ",".join([str(i) for i in items_to_write]), str(ex))

    # when running locally, simulate the dynamodb stream INSERT events
    if self._run_local:
        for i in self._new_action_items:
            TaskTrackingTable._run_local_stream_event(os.getenv(handlers.ENV_ACTION_TRACKING_TABLE),
                                                      "INSERT",
                                                      new_item=i,
                                                      context=self._context)

    self._new_action_items = []
def is_completed(self, snapshot_create_data):
    """
    Tests if the snapshot copy started earlier has completed in the destination region.

    Returns a non-None result when the task is done (copy completed, already copied, or
    source no longer available), None while the copy is still pending. On completion it
    grants create-volume permissions, tags the copy in shared accounts and in the source
    account, and either deletes the source snapshot or tags it.

    :param snapshot_create_data: result data of the execute step, containing at least
        "copy-snapshot-id" and optionally "already-copied", "not-longer-available"
        and "tags"
    :return: copied snapshot data when completed, self.result for the short-circuit
        cases, None when the copy is still pending
    :raises: via raise_exception when the copy ended in an error state
    """

    def delete_source_after_copy():
        # deletes the source snapshot once the copy has completed
        self._logger_.info(INF_DELETING_SNAPSHOT, self.source_snapshot_id)
        self.ec2_source_client.delete_snapshot_with_retries(SnapshotId=self.source_snapshot_id)
        self._logger_.info(INF_SNAPSHOT_DELETED, self.source_snapshot_id, self.source_region)

    def source_tags(copy_id, source_tags_param):
        # builds the tag set for the source snapshot from the template in the given parameter
        snapshot_tags = {}
        snapshot_tags.update(
            self.build_tags_from_template(parameter_name=source_tags_param,
                                          region=self.source_region,
                                          tag_variables={
                                              TAG_PLACEHOLDER_COPIED_SNAPSHOT_ID: copy_id,
                                              TAG_PLACEHOLDER_COPIED_REGION: self._destination_region_
                                          }))
        return snapshot_tags

    def set_source_snapshot_tags(copy_id):
        # tags the source snapshot (used when it is kept after the copy)
        snapshot_tags = source_tags(copy_id, PARAM_SOURCE_TAGS)
        if len(snapshot_tags) == 0:
            return

        self._logger_.info(INF_CREATE_SOURCE_TAGS, snapshot_tags, self._account_)

        if len(snapshot_tags) > 0:
            tagging.set_ec2_tags(ec2_client=self.ec2_source_client,
                                 resource_ids=[self.source_snapshot_id],
                                 tags=snapshot_tags,
                                 logger=self._logger_)

            self._logger_.info(INF_TAGS_CREATED)

    def grant_create_volume_permissions(snap_id):
        # grants createVolumePermission on the copied snapshot to the configured accounts
        if self.accounts_with_create_permissions is not None and len(
                self.accounts_with_create_permissions) > 0:

            args = {
                "CreateVolumePermission": {
                    "Add": [{"UserId": a.strip()} for a in self.accounts_with_create_permissions]
                },
                "SnapshotId": snap_id
            }

            try:
                self.ec2_destination_client.modify_snapshot_attribute_with_retries(**args)
                self._logger_.info(INF_SETTING_CREATE_VOLUME_PERMISSIONS,
                                   ", ".join(self.accounts_with_create_permissions))
            except Exception as ex:
                raise_exception(ERR_SETTING_CREATE_VOLUME_PERMISSIONS, self.accounts_with_create_permissions, ex)

    def tag_shared_snapshots(tags, snap_id):
        # creates tags for snapshots that have been shared in account the snapshots are shared with
        if len(tags) == 0 or not self.tag_snapshots_in_shared_accounts:
            return

        if self.accounts_with_create_permissions in ["", None]:
            return

        for account in self.accounts_with_create_permissions:

            # assume a role in the shared account to be able to tag the snapshot there
            session_for_tagging = self.get_action_session(account=account,
                                                          param_name=PARAM_DESTINATION_ACCOUNT_TAG_ROLENAME,
                                                          logger=self._logger_)

            if session_for_tagging is None:
                self._logger_.error(ERR_TAGS_NOT_SET_IN_ACCOUNT, account)
                continue

            try:
                ec2_client = get_client_with_retries(service_name="ec2",
                                                     methods=["create_tags", "delete_tags"],
                                                     context=self._context_,
                                                     region=self.get(PARAM_DESTINATION_REGION),
                                                     session=session_for_tagging,
                                                     logger=self._logger_)

                tagging.set_ec2_tags(ec2_client=ec2_client,
                                     resource_ids=[snap_id],
                                     tags=tags,
                                     logger=self._logger_)

                self._logger_.info(INF_CREATE_SHARED_TAGS, tags, account)

            except Exception as ex:
                raise_exception(ERR_SETTING_SHARED_TAGS, account, str(ex))

    def tag_shared_source_snapshot(copy_id):
        # creates tags on the source snapshot in the account that shared it
        snapshot_tags = source_tags(copy_id, PARAM_SOURCE_SHARED_BY_TAGS)
        if len(snapshot_tags) == 0 or not self.tag_snapshots_in_source_account:
            return

        # only for snapshots that have been shared by other account
        if self.owner == self.get_account_for_task():
            self._logger_.debug("Account {} is owner, no tags set for snapshot {} in account of owner",
                                self._account_, self.source_snapshot_id)
            return

        # assume a role in the owning account to be able to tag the source snapshot there
        session_for_tagging = self.get_action_session(account=self.owner,
                                                      param_name=PARAM_SOURCE_ACCOUNT_TAG_ROLE_NAME,
                                                      logger=self._logger_)

        if session_for_tagging is None:
            self._logger_.error(ERR_TAGS_NOT_SET_IN_ACCOUNT, self.owner)
            return

        try:
            self._logger_.info(INF_CREATE_SHARED_ACCOUNT_SNAPSHOT_TAGS, snapshot_tags,
                               self.source_snapshot_id, self.owner)
            ec2_client = get_client_with_retries(service_name="ec2",
                                                 methods=["create_tags", "delete_tags"],
                                                 context=self._context_,
                                                 region=self.source_region,
                                                 session=session_for_tagging,
                                                 logger=self._logger_)

            tagging.set_ec2_tags(ec2_client=ec2_client,
                                 resource_ids=[self.source_snapshot_id],
                                 tags=snapshot_tags,
                                 logger=self._logger_)

        except Exception as ex:
            raise_exception(ERR_SETTING_SOURCE_SHARED_TAGS, self.owner, str(ex))

    # snapshot was already copied by an earlier run of the task
    if snapshot_create_data.get("already-copied", False):
        self._logger_.info(INF_COMPLETE_ALREADY_COPIED, self.source_snapshot_id)
        return self.result

    # source snapshot disappeared before it could be copied
    if snapshot_create_data.get("not-longer-available", False):
        self._logger_.info(INF_COMPLETED_NOT_LONGER_AVAILABLE, self.source_snapshot_id)
        return self.result

    # create service instance to test if snapshot exists
    ec2 = services.create_service("ec2", session=self._session_,
                                  service_retry_strategy=get_default_retry_strategy("ec2", context=self._context_))

    copy_snapshot_id = snapshot_create_data["copy-snapshot-id"]

    # test if the snapshot with the id that was returned from the CopySnapshot API call exists and is completed
    copied_snapshot = ec2.get(services.ec2_service.SNAPSHOTS,
                              region=self._destination_region_,
                              OwnerIds=["self"],
                              Filters=[{
                                  "Name": "snapshot-id",
                                  "Values": [copy_snapshot_id]
                              }])

    if copied_snapshot is not None:
        self._logger_.debug(INF_CHECK_COMPLETED_RESULT, copied_snapshot)

    state = copied_snapshot["State"] if copied_snapshot is not None else None

    # not found yet or still pending: signal "not completed" so the check is repeated later
    if copied_snapshot is None or state == SNAPSHOT_STATE_PENDING:
        self._logger_.info(INF_COPY_PENDING, copy_snapshot_id, self._destination_region_)
        return None

    if state == SNAPSHOT_STATE_ERROR:
        # remove the "copied-to" marker tag from the source so the copy can be attempted again
        copied_tag_name = Ec2CopySnapshotAction.marker_tag_copied_to(self._task_)
        self.ec2_source_client.delete_tags_with_retries(Resources=[self.source_snapshot_id],
                                                        Tags=[{
                                                            "Key": copied_tag_name
                                                        }])
        raise_exception(ERR_COPY_SNAPSHOT)

    if state == SNAPSHOT_STATE_COMPLETED:
        self._logger_.info(INF_COPY_COMPLETED, self.source_snapshot_id, self.source_region,
                           copy_snapshot_id, self._destination_region_)
        grant_create_volume_permissions(copy_snapshot_id)
        tag_shared_snapshots(snapshot_create_data.get("tags", {}), copy_snapshot_id)
        tag_shared_source_snapshot(copy_snapshot_id)
        # source is either deleted or tagged, depending on the task configuration
        if self.delete_after_copy:
            delete_source_after_copy()
        else:
            set_source_snapshot_tags(copy_snapshot_id)

        # short pause because the EC2 service count against the limit for the max number of
        # concurrent snapshot copies is sometimes delayed
        # NOTE(review): an earlier comment said 15 seconds but the code sleeps 5 — confirm intended delay
        time.sleep(5)

        return copied_snapshot

    return None
def handle_request(self):
    """
    Handles the select resources request. Creates new actions for resources found for a task.

    Describes the task's resources per assumed role (account) and region, filters them by
    task-list tag or tag filter plus an optional action-level filter method, aggregates the
    selected resources at the configured level (task / account / region / resource), stores
    the resulting task actions via the actions tracking table, and reports select metrics.

    :return: Results of handling the request (safe_dict with datetime, running-time and
        the dispatched task items)
    :raises: via raise_exception on selection errors or when selection/storing times out
    """

    def filter_by_action_filter(srv, used_role, r):
        # applies the action class's optional select-and-process method to a resource;
        # returns the (possibly transformed) resource, or None when it is filtered out
        filter_method = getattr(self.action_class, actions.SELECT_AND_PROCESS_RESOURCE_METHOD, None)
        if filter_method is not None:
            r = filter_method(srv, self._logger, self._resource_name, r, self._context, self.task, used_role)
            if r is None:
                self._logger.debug(DEBUG_FILTER_METHOD, self.action_class.__name__,
                                   actions.SELECT_AND_PROCESS_RESOURCE_METHOD)
                return None
            else:
                self._logger.debug(DEBUG_FILTERED_RESOURCE, self.action_class.__name__,
                                   actions.SELECT_AND_PROCESS_RESOURCE_METHOD, safe_json(r, indent=3))

        return r

    def is_selected_resource(aws_service, resource, used_role, taskname, tags_filter, does_resource_supports_tags):
        # decides whether a described resource is selected for this task, using the task
        # tag / tag filter first and the action's filter method as the final step

        # No tags then just use filter method if any
        if not does_resource_supports_tags:
            self._logger.debug(DEBUG_RESOURCE_NO_TAGS, resource)
            return filter_by_action_filter(srv=aws_service,
                                           used_role=used_role,
                                           r=resource)

        tags = resource.get("Tags", {})

        # name of the tag that holds the list of tasks for this resource
        tagname = self._task_tag

        if tags_filter is None:
            # test if name of the task is in list of tasks in tag value
            if (tagname not in tags) or (taskname not in tagging.split_task_list(tags[tagname])):
                self._logger.debug(DEBUG_RESOURCE_NOT_SELECTED, safe_json(resource, indent=2), taskname,
                                   ','.join(["'{}'='{}'".format(t, tags[t]) for t in tags]))
                return None
            self._logger.debug(DEBUG_SELECTED_BY_TASK_NAME_IN_TAG_VALUE, safe_json(resource, indent=2),
                               tagname, taskname)
        else:
            # using a tag filter, * means any tag
            if tags_filter != tagging.tag_filter_set.WILDCARD_CHAR:

                # test if there are any tags matching the tag filter
                if not TagFilterExpression(tags_filter).is_match(tags):
                    self._logger.debug(DEBUG_RESOURCE_NOT_SELECTED_TAG_FILTER, safe_json(resource, indent=2), taskname,
                                       ','.join(["'{}'='{}'".format(t, tags[t]) for t in tags]))
                    return None
                self._logger.debug(DEBUG_SELECTED_BY_TAG_FILTER, safe_json(resource, indent=2), tags,
                                   tag_filter_str, taskname)
            else:
                self._logger.debug(DEBUG_SELECTED_WILDCARD_TAG_FILTER, safe_json(resource, indent=2), taskname)
                return filter_by_action_filter(srv=aws_service,
                                               used_role=used_role,
                                               r=resource)

        return filter_by_action_filter(srv=aws_service,
                                       used_role=used_role,
                                       r=resource)

    def resource_batches(resources):
        """
        Returns resources as chunks of size items. If the class has an optional custom aggregation function then the
        resources are aggregated first using this function before applying the batch size
        :param resources: resources to process
        :return: Generator for blocks of resource items
        """

        aggregate_func = getattr(self.action_class, actions.CUSTOM_AGGREGATE_METHOD, None)

        for i in aggregate_func(resources, self.task_parameters, self._logger) \
                if aggregate_func is not None else [resources]:
            if self.batch_size is None:
                yield i
            else:
                # split this aggregation into chunks of at most batch_size items
                first = 0
                while first < len(i):
                    yield i[first:first + self.batch_size]
                    first += self.batch_size

    def setup_tag_filtering(t_name):
        # determines the tag (if any) to optimise the select with, plus the effective
        # tag filter string; returns (select_tag, tag_filter_string)

        # get optional tag filter
        no_select_by_tags = self.action_properties.get(actions.ACTION_NO_TAG_SELECT, False)
        if no_select_by_tags:
            tag_filter_string = tagging.tag_filter_set.WILDCARD_CHAR
        else:
            tag_filter_string = self.task.get(handlers.TASK_TAG_FILTER)

        # set if only a single task is required for selecting the resources, it is used to optimise the select
        select_tag = None
        if tag_filter_string is None:
            self._logger.debug(DEBUG_SELECT_BY_TASK_NAME, self._resource_name, self._task_tag, t_name)
            select_tag = self._task_tag
        elif tag_filter_string == tagging.tag_filter_set.WILDCARD_CHAR:
            self._logger.debug(DEBUG_SELECT_ALL_RESOURCES, self._resource_name)
        else:
            self._logger.debug(DEBUG_TAG_FILTER_USED_TO_SELECT_RESOURCES, self._resource_name)
            # build the tag expression that us used to filter the resources
            tag_filter_expression = TagFilterExpression(tag_filter_string)
            # the keys of the used tags
            tag_filter_expression_tag_keys = list(tag_filter_expression.get_filter_keys())
            # if there is only a single tag then we can optimize by just filtering on that specific tag
            if len(tag_filter_expression_tag_keys) == 1 and \
                    tagging.tag_filter_set.WILDCARD_CHAR not in tag_filter_expression_tag_keys[0]:
                select_tag = tag_filter_expression_tag_keys[0]
        return select_tag, tag_filter_string

    def add_aggregated(aggregated_resources):
        # create tasks action for aggregated resources, optionally split in batch size chunks
        # (generator; captures assumed_role from the enclosing accounts loop)
        for ra in resource_batches(aggregated_resources):
            if self._check_can_execute(ra):
                action_item = self.actions_tracking.add_task_action(
                    task=self.task,
                    assumed_role=assumed_role,
                    action_resources=ra,
                    task_datetime=self.task_dt,
                    source=self.source,
                    task_group=self.task_group)

                self._logger.debug(DEBUG_ADDED_AGGREGATED_RESOURCES_TASK, action_item[handlers.TASK_TR_ID],
                                   len(ra), self._resource_name, self.task[handlers.TASK_NAME])

                self._logger.debug("Added item\n{}", safe_json(action_item, indent=3))

                yield action_item

    def add_as_individual(resources):
        # create one task action per selected resource (generator; captures assumed_role)
        for ri in resources:
            # task action for each selected resource
            if self._check_can_execute([ri]):
                action_item = self.actions_tracking.add_task_action(
                    task=self.task,
                    assumed_role=assumed_role,
                    action_resources=ri,
                    task_datetime=self.task_dt,
                    source=self.source,
                    task_group=self.task_group)

                self._logger.debug(DEBUG_ADD_SINGLE_RESOURCE_TASK, action_item[handlers.TASK_TR_ID],
                                   self._resource_name, self.task[handlers.TASK_NAME])
                self._logger.debug("Added item\n{}", safe_json(action_item, indent=3))
                yield action_item

    try:
        task_items = []
        start = datetime.now()

        self._logger.debug(DEBUG_EVENT, safe_json(self._event, indent=3))
        self._logger.debug(DEBUG_ACTION, safe_json(self.action_properties, indent=3))

        self._logger.info(INFO_SELECTED_RESOURCES, self._resource_name, self.service,
                          self.task[handlers.TASK_NAME])
        self._logger.info(INFO_AGGR_LEVEL, self.aggregation_level)

        task_level_aggregated_resources = []

        args = self._build_describe_argument()

        # determine if the selected resource type supports tags for this service
        service_resource_with_tags = services.create_service(self.service).resources_with_tags
        if self._resource_name == "":
            supports_tags = len(service_resource_with_tags) != 0
        else:
            supports_tags = self._resource_name.lower() in [r.lower() for r in service_resource_with_tags]

        args["tags"] = supports_tags
        self._logger.info(INFO_USE_TAGS_TO_SELECT, "R" if supports_tags else "No r")

        task_name = self.task[handlers.TASK_NAME]
        count_resource_items = 0
        selected_resource_items = 0

        select_on_tag, tag_filter_str = setup_tag_filtering(task_name)

        filter_func = getattr(self.action_class, actions.FILTER_RESOURCE_METHOD, None)

        # timer to guard selection time and log warning if getting close to lambda timeout
        if self._context is not None:
            self.start_timer(REMAINING_TIME_AFTER_DESCRIBE)
        try:

            # outer loop: one iteration per assumed role / account
            for assumed_role in self._task_assumed_roles():
                retry_strategy = get_default_retry_strategy(service=self.service, context=self._context)

                service = services.create_service(service_name=self.service,
                                                  service_retry_strategy=retry_strategy,
                                                  role_arn=assumed_role)

                if self.is_timed_out():
                    break

                # contains resources for account
                account_level_aggregated_resources = []

                self._logger.info(INFO_ACCOUNT, service.aws_account)
                if assumed_role not in [None, ""]:
                    self._logger.info(INFO_ASSUMED_ROLE, assumed_role)

                for region in self._regions:

                    # test for timeouts
                    if self.is_timed_out():
                        break

                    # handle region passed in the event
                    if region is not None:
                        args["region"] = region
                    else:
                        if "region" in args:
                            del args["region"]

                    # resources can be passed in the invent by event handlers
                    all_resources = self._event.get(handlers.HANDLER_SELECT_RESOURCES, None)

                    if all_resources is None:

                        # actions can have an optional method to select resources
                        action_custom_describe_function = getattr(self.action_class, "describe_resources", None)
                        if action_custom_describe_function is not None and self.use_custom_select:
                            all_resources = action_custom_describe_function(service, self.task, region)
                        else:

                            # select resources from the service
                            self._logger.debug(DEBUG_SELECT_PARAMETERS, self._resource_name, self.service, args)
                            # selecting a list of all resources in this account/region
                            all_resources = list(service.describe(self._resource_name,
                                                                  filter_func=filter_func,
                                                                  select_on_tag=select_on_tag,
                                                                  **args))

                    # test for timeout
                    if self.is_timed_out():
                        break

                    count_resource_items += len(all_resources)

                    self._logger.info(INFO_RESOURCES_FOUND, len(all_resources))

                    # select resources that are processed by the task
                    selected_resources = []
                    for sr in all_resources:
                        sel = is_selected_resource(aws_service=service,
                                                   resource=sr,
                                                   used_role=assumed_role,
                                                   taskname=task_name,
                                                   tags_filter=tag_filter_str,
                                                   does_resource_supports_tags=supports_tags)
                        if sel is not None:
                            selected_resources.append(sel)

                    selected_resource_items += len(selected_resources)

                    # display found and selected resources
                    if len(all_resources) > 0:
                        self._logger.info(INFO_RESOURCES_SELECTED, len(selected_resources))
                        if len(selected_resources) == 0:
                            continue

                    # delete tags if not needed by the action
                    if not self.keep_tags:
                        for res in selected_resources:
                            if "Tags" in res:
                                del res["Tags"]

                    # add resources to total list of resources for this task
                    if self.aggregation_level == actions.ACTION_AGGREGATION_TASK:
                        task_level_aggregated_resources += selected_resources

                    # add resources to list of resources for this account
                    if self.aggregation_level == actions.ACTION_AGGREGATION_ACCOUNT:
                        account_level_aggregated_resources += selected_resources

                    # add batch(es) of resources for this region
                    if self.aggregation_level == actions.ACTION_AGGREGATION_REGION and len(selected_resources) > 0:
                        task_items += list(add_aggregated(selected_resources))

                    # no aggregation, add each individual resource
                    if self.aggregation_level == actions.ACTION_AGGREGATION_RESOURCE and len(selected_resources) > 0:
                        task_items += list(add_as_individual(selected_resources))

                # at the end of the region loop, check if aggregated resources for account need to be added
                if self.aggregation_level == actions.ACTION_AGGREGATION_ACCOUNT and len(
                        account_level_aggregated_resources) > 0:
                    task_items += list(add_aggregated(account_level_aggregated_resources))

            # at the end of the accounts loop, check if aggregated resources for task need to be added
            if self.aggregation_level == actions.ACTION_AGGREGATION_TASK and len(
                    task_level_aggregated_resources) > 0:
                task_items += list(add_aggregated(task_level_aggregated_resources))

        except Exception as ex:
            raise_exception(ERR_SELECTING_TASK_RESOURCES, self.task[handlers.TASK_NAME], ex)

        finally:
            if self._timer is not None:
                # cancel time used avoid timeouts when selecting resources
                self._timer.cancel()
                if self.is_timed_out():
                    raise_exception(ERR_TIMEOUT_SELECTING_RESOURCES, self._resource_name, self.service, task_name)

                # restart the timer to guard the storing of the selected resources
                self.start_timer(REMAINING_TIME_AFTER_STORE)

                self.actions_tracking.flush(self._timeout_event)
                if self.is_timed_out():
                    raise_exception(ERR_CREATING_TASKS_FOR_SELECTED_RESOURCES, task_name)

                self._timer.cancel()
            else:
                # no lambda context/timer: flush without timeout guarding
                self.actions_tracking.flush()

        self._logger.info(INFO_ADDED_ITEMS, len(task_items), self.task[handlers.TASK_NAME])

        running_time = float((datetime.now() - start).total_seconds())
        self._logger.info(INFO_RESULT, running_time)

        if self.metrics:
            put_task_select_data(task_name=task_name,
                                 items=count_resource_items,
                                 selected_items=selected_resource_items,
                                 logger=self._logger,
                                 selection_time=running_time)

        return safe_dict({
            "datetime": datetime.now().isoformat(),
            "running-time": running_time,
            "dispatched-tasks": task_items
        })

    finally:
        self._logger.flush()