def does_athena_table_exist(run_id, athena_database_name, athena_table_name):
    """ Checks if an Athena table already exists

    Arguments:
        run_id {string} -- run_id for the current Step Function execution
        athena_database_name {string} -- Athena database to use for query execution
        athena_table_name {string} -- Athena table name

    Returns:
        boolean -- Flag representing if the table already exists
    """
    exists = False
    athena_client = awshelper.get_client(awshelper.ServiceName.athena)
    execution_id = None
    try:
        execution_id = run_query(run_id,
                                 athena_client,
                                 f'SHOW TABLES IN {athena_database_name}',
                                 None,
                                 True)
    except utility.S3InsightsException as e:
        logging.info(f'received exception while listing tables: {e}')
    if execution_id is not None:
        result = athena_client.get_query_results(QueryExecutionId=execution_id)
        for row in result['ResultSet']['Rows']:
            table_name = row['Data'][0]['VarCharValue']
            if utility.compare_strings(athena_table_name, table_name):
                exists = True
                break
    return exists

def is_notification_queue_empty():
    """ Checks if the notification queue is empty

    Returns:
        boolean -- Flag representing if the notification queue is empty
    """
    queue_arn_parts = config.DeploymentDetails.sqs_arn.split(sep=':')
    queue_name = queue_arn_parts[-1]
    sqs_client = awshelper.get_client(awshelper.ServiceName.sqs)
    response = sqs_client.get_queue_url(QueueName=queue_name)
    sqs_url = response['QueueUrl']
    approx_msg_count_attribute_name = 'ApproximateNumberOfMessages'
    approx_not_visible_msg_count_attribute_name = 'ApproximateNumberOfMessagesNotVisible'
    approx_delayed_msg_count_attribute_name = 'ApproximateNumberOfMessagesDelayed'
    response = sqs_client.get_queue_attributes(
        QueueUrl=sqs_url,
        AttributeNames=[
            approx_msg_count_attribute_name,
            approx_not_visible_msg_count_attribute_name,
            approx_delayed_msg_count_attribute_name
        ])
    logging.info(f'SQS attributes:{response}')
    attributes = response['Attributes']
    if (attributes[approx_msg_count_attribute_name] == '0'
            and attributes[approx_not_visible_msg_count_attribute_name] == '0'
            and attributes[approx_delayed_msg_count_attribute_name] == '0'):
        return True
    return False

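# Illustrative (not captured from a real run) shape of the get_queue_attributes
# response consumed above. SQS reports the approximate counts as strings, which
# is why the values are compared against '0' rather than 0:
#
# {
#     'Attributes': {
#         'ApproximateNumberOfMessages': '0',
#         'ApproximateNumberOfMessagesNotVisible': '0',
#         'ApproximateNumberOfMessagesDelayed': '0'
#     }
# }
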
def delete_regional_s3_inventory_bucket(input_parameters, region):
    """ Delete the staging destination bucket for the given region

    Arguments:
        input_parameters {config.S3InsightsInput} -- Input parameters for the current execution
        region {string} -- AWS region name
    """
    s3_resource = awshelper.get_resource(awshelper.ServiceName.s3)
    bucket_name = get_destination_bucket_name(input_parameters.run_id, region)
    logging.info(f'deleting all objects from {region} {bucket_name}')
    try:
        bucket = s3_resource.Bucket(bucket_name)
        bucket.objects.all().delete()
    except ClientError as e:
        logging.error(f'error while deleting all s3 objects from inventory destination bucket {bucket_name}. error details: {e}')
    cloudformation_client = awshelper.get_client(
        awshelper.ServiceName.cloudformation,
        region)
    stack_name = get_stack_name(input_parameters.run_id)
    logging.info(f'deleting cloudformation stack {stack_name} from {region}')
    try:
        response = cloudformation_client.delete_stack(
            StackName=stack_name)
        logging.info(response)
    except ClientError as e:
        logging.error(f'error while deleting inventory destination bucket stack {stack_name} for {region} region. error details: {e}')

def get_athena_queries():
    """ Get Athena queries from the configuration table

    Returns:
        list(AthenaDetails) -- Athena query details
    """
    table = get_table()
    response = table.query(
        KeyConditionExpression=Key(TableFieldName.partitionkey).eq(TableValueCategory.athena_query))
    details = []
    athena_client = awshelper.get_client(awshelper.ServiceName.athena)
    for item in response['Items']:
        query_execution_id = item[TableFieldName.sortkey]
        get_query_execution_response = athena_client.get_query_execution(
            QueryExecutionId=query_execution_id)
        query_execution_result = get_query_execution_response['QueryExecution']
        s3_output_location = query_execution_result['ResultConfiguration']['OutputLocation']
        execution_state = query_execution_result['Status']['State']
        actual_query = query_execution_result['Query']
        details.append(
            AthenaDetails(
                query_execution_id,
                item[TableFieldName.name],
                item[TableFieldName.field_value],
                s3_output_location,
                execution_state,
                actual_query))
    return details

def simulate(input_parameters):
    """ Simulate smoke test

    Arguments:
        input_parameters {config.S3InsightsInput} -- Input parameters for the current execution

    Raises:
        utility.S3InsightsException: If none of the source buckets can generate inventory reports
    """
    logging.info('simulating smoke test in the current environment')
    account_id = None
    region = None
    bucket_name = None
    account_ids = [
        account_config.id for account_config in input_parameters.accounts
    ]
    for account_id in account_ids:
        source_buckets_details = ddb.get_source_buckets_details(account_id)
        for ddb_bucket in source_buckets_details:
            inventory_status = ddb_bucket[ddb.TableFieldName.inventory_status]
            if utility.compare_strings(inventory_status,
                                       ddb.BucketInventoryStatus.in_progress):
                account_id = ddb_bucket[ddb.TableFieldName.account_id]
                region = ddb_bucket[ddb.TableFieldName.region]
                bucket_name = ddb_bucket[ddb.TableFieldName.sortkey]
                break
        if bucket_name is not None:
            break
    if bucket_name is None:
        raise utility.S3InsightsException(
            'could not find a bucket for smoke test')

    s3_client = awshelper.get_client(awshelper.ServiceName.s3)
    file_path = utility.get_file_path(
        __file__, "smoketestdata/sample_inventory_object.orc")
    s3_key = "{0}/{1}/{2}/inventorysmoketest/data/smoke_test_inventory_object.orc".format(
        account_id, region, bucket_name)
    destination_bucket_name = s3.get_destination_bucket_name(
        input_parameters.run_id, region)
    logging.info(
        f'smoke test destination_bucket_name:{destination_bucket_name} s3_key:{s3_key}')
    response = s3_client.upload_file(file_path, destination_bucket_name, s3_key)
    logging.info(f'uploading a sample inventory object. response:{response}')

    s3_key = "{0}/{1}/{2}/inventorysmoketest/somedate/manifest.checksum".format(
        account_id, region, bucket_name)
    sleep_time_in_seconds = config.ServiceParameters.smoke_test_sleep_time_in_seconds
    time.sleep(sleep_time_in_seconds)
    response = s3_client.upload_file(file_path, destination_bucket_name, s3_key)
    logging.info(f'uploading a sample manifest checksum. response:{response}')
    time.sleep(sleep_time_in_seconds)

def create_inventory_destination_buckets(input_parameters):
    """ Create inventory destination buckets

    Arguments:
        input_parameters {config.S3InsightsInput} -- Input parameters for the current execution
    """
    template_file_path = utility.get_file_path(
        __file__, "template/inventory-destination.json")
    with open(template_file_path, "r") as template_file:
        template_text = template_file.read()
    stacks = []
    regions = input_parameters.supported_regions
    for region in regions:
        bucket_name = get_destination_bucket_name(
            input_parameters.run_id, region)
        topic_name = utility.get_resource_name(
            input_parameters.run_id, 'sns', 'notification-topic')
        acceleration_status = 'Enabled'
        parameters = [
            {
                'ParameterKey': 'BucketName',
                'ParameterValue': bucket_name
            },
            {
                'ParameterKey': 'SQSArn',
                'ParameterValue': config.DeploymentDetails.sqs_arn
            },
            {
                'ParameterKey': 'TopicName',
                'ParameterValue': topic_name
            },
            {
                'ParameterKey': 'AccelerationStatus',
                'ParameterValue': acceleration_status
            }
        ]
        stack_name = get_stack_name(input_parameters.run_id)
        cloudformation_client = awshelper.get_client(
            awshelper.ServiceName.cloudformation, region)
        response = cloudformation_client.create_stack(
            StackName=stack_name,
            TemplateBody=template_text,
            Parameters=parameters)
        logging.info(f'create stack response: {response}')
        stacks.append(StackDetails(cloudformation_client, stack_name))
    wait_for_stack_operations_to_finish(
        stacks,
        'create_in_progress',
        'create_complete',
        20)

def send_email_verification_request(email_address):
    """ Send email verification request for a specific email address

    Arguments:
        email_address {string} -- Email address
    """
    if not is_email_address_verified(email_address):
        ses_client = awshelper.get_client(awshelper.ServiceName.ses)
        response = ses_client.verify_email_identity(EmailAddress=email_address)
        logging.info(f'ses verify email api response:{response}')

def remove_bucket_inventory_configuration(run_id, account_id, region, bucket_name):
    """ Remove inventory configuration from the given S3 bucket

    Arguments:
        run_id {string} -- run_id for the current Step Function execution
        account_id {string} -- AWS account id
        region {string} -- AWS region name
        bucket_name {string} -- Bucket name
    """
    s3_client = awshelper.get_client(
        awshelper.ServiceName.s3,
        region,
        account_id,
        run_id)
    remove_bucket_inventory_configuration_internal(
        s3_client,
        run_id,
        account_id,
        region,
        bucket_name)

def get_verified_identities(identity_type):
    """ Return verified identities

    Arguments:
        identity_type {string} -- EmailAddress/Domain

    Returns:
        list(string) -- Verified identities
    """
    ses_client = awshelper.get_client(awshelper.ServiceName.ses)
    verified_identities = []
    paginator = ses_client.get_paginator('list_identities')
    response_iterator = paginator.paginate(
        IdentityType=identity_type,
        PaginationConfig={
            'MaxItems': 1000,
            'PageSize': 100
        })
    for response in response_iterator:
        verified_identities.extend(response['Identities'])
    return verified_identities

def get_source_buckets(input_parameters, account_id):
    """ Get all eligible source buckets

    Arguments:
        input_parameters {config.S3InsightsInput} -- Input parameters for the current execution
        account_id {string} -- AWS account id

    Returns:
        dict<string, dict<string, list(string)>> -- Source buckets
    """
    source_buckets = {}
    account_id = account_id.lower()
    account_config = next(
        account_config for account_config in input_parameters.accounts
        if utility.compare_strings(account_config.id, account_id))
    source_buckets[account_id] = {}
    s3_client = awshelper.get_client(
        awshelper.ServiceName.s3,
        None,
        account_id,
        input_parameters.run_id)

    # Exclude the consolidation and inventory destination buckets
    pipeline_buckets = []
    if utility.compare_strings(account_id, awshelper.SessionManager.get_host_account_id()):
        pipeline_buckets.append(config.DeploymentDetails.consolidated_inventory_bucket_name)
        for region in input_parameters.supported_regions:
            bucket_name = get_destination_bucket_name(input_parameters.run_id, region)
            pipeline_buckets.append(bucket_name)

    response = s3_client.list_buckets()
    for bucket in response["Buckets"]:
        name = bucket["Name"].lower()
        if name not in account_config.exclude and name not in pipeline_buckets:
            try:
                location = s3_client.get_bucket_location(Bucket=name)
                region = location['LocationConstraint']
                if region is None:
                    region = 'us-east-1'
                region = region.lower()
                if region in input_parameters.supported_regions:
                    if region not in source_buckets[account_id]:
                        source_buckets[account_id][region] = []
                    source_buckets[account_id][region].append(name)
            except ClientError as e:
                logging.error(f'error while retrieving bucket information for {account_id}:{name}. error details: {e}')
    return source_buckets

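# Illustrative example of the structure returned by get_source_buckets
# (account ids and bucket names below are made up):
#
# {
#     "111111111111": {
#         "us-east-1": ["example-bucket-a", "example-bucket-b"],
#         "us-west-2": ["example-bucket-c"]
#     }
# }
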
def remove_bucket_inventory_configurations(input_parameters, source_buckets_ddb):
    """ Remove inventory configurations from the given list of source buckets

    Arguments:
        input_parameters {config.S3InsightsInput} -- Input parameters for the current execution
        source_buckets_ddb {dict<string, dict<string, list(string)>>} -- Source buckets
    """
    for account_id in source_buckets_ddb:
        for region in source_buckets_ddb[account_id]:
            s3_client = awshelper.get_client(
                awshelper.ServiceName.s3,
                region,
                account_id,
                input_parameters.run_id)
            for bucket in source_buckets_ddb[account_id][region]:
                bucket_name = bucket.name
                remove_bucket_inventory_configuration_internal(
                    s3_client,
                    input_parameters.run_id,
                    account_id,
                    region,
                    bucket_name)

def send_welcome_email():
    """ Send welcome email
    """
    queries = ddb.get_athena_queries()
    query_details_html = '''
    <html>
        <head>
            <style>
                table, th, td {
                    border: 1px solid black;
                    border-collapse: collapse;
                }
                .success {
                    background-color: rgba(0, 255, 0, 0.2);
                }
                .failed {
                    background-color: rgba(255, 0, 0, 0.2);
                }
                .neutral {
                    background-color:white;
                }
            </style>
        </head>
        <body>
            <p>
                Your latest <a href="https://github.com/kurmiashish/S3Insights/blob/master/docs/user_guide.md#how-to-initiate-a-state-machine-execution">S3Insights Harvester execution</a> generated this welcome email. You can learn more about the platform <a href="https://github.com/kurmiashish/S3Insights">here</a>.
            </p>
    '''
    intro_html = 'In this run, the following Athena queries were executed. You can run additional Athena queries manually by following <a href="https://github.com/kurmiashish/S3Insights/blob/master/docs/user_guide.md#running-athena-analysis-queries-manually">these instructions</a>. Please refer to the <a href="https://github.com/kurmiashish/S3Insights/blob/master/docs/troubleshooting.md#athena-failures">Athena troubleshooting document</a> if any of the following Athena queries have failed.'
    input_parameters = ddb.get_input_parameters()
    if input_parameters.is_smoke_test:
        intro_html = intro_html + ' <b>As this is a smoke test run, the following links may not work as the platform may have deleted the Athena resources.</b>'
    query_details_html = query_details_html + "<h4>Analysis Queries</h4><p>" + intro_html + "</p>"
    query_details_html = query_details_html + '''
        <table>
            <tr>
                <th>Name</th>
                <th>Query</th>
                <th>Status</th>
                <th>Execution Details</th>
            </tr>
    '''
    succeeded_status_value = 'succeeded'
    done_status_value = 'done'
    bucket_is_empty_status_value = 'bucket_is_empty'
    everything_else_status_value = 'everything_else'
    success_css_class_name = 'success'
    failed_css_class_name = 'failed'
    neutral_css_class_name = 'neutral'
    css_mappings = {
        succeeded_status_value: success_css_class_name,
        done_status_value: success_css_class_name,
        everything_else_status_value: failed_css_class_name,
        bucket_is_empty_status_value: neutral_css_class_name
    }
    for state in [succeeded_status_value, everything_else_status_value]:
        for query in queries:
            should_include = False
            if not utility.compare_strings(state, everything_else_status_value):
                should_include = utility.compare_strings(state, query.state)
            else:
                should_include = not utility.compare_strings(
                    succeeded_status_value, query.state)
            if should_include:
                css_class_name = css_mappings[state]
                query_web_console_link = 'https://console.aws.amazon.com/athena/home?region={0}#query/history/{1}'.format(
                    config.DeploymentDetails.region, query.query_execution_id)
                query_web_console_link_html = '<a href={0}> Web Console Link </a>'.format(
                    query_web_console_link)
                query_details_html = query_details_html + f'<tr class="{css_class_name}"><td>' + ' </td><td>'.join(
                    [
                        query.query_name,
                        query.actual_query,
                        query.state,
                        query_web_console_link_html
                    ]) + '</td></tr>'
    query_details_html = query_details_html + '</table><br>'

    bucket_html_table = '''
        <h4>Source buckets</h4>
        <p>
            The following buckets are included in the analysis. If the platform failed to generate inventory for any of the buckets (i.e., if any entry in the following table is highlighted in red), please consult the <a href="https://github.com/kurmiashish/S3Insights/blob/master/docs/troubleshooting.md#inventory-generation-failures">inventory generation troubleshooting document</a>.
        </p>
        <table>
            <tr>
                <th>Account</th>
                <th>Region</th>
                <th>Bucket</th>
                <th>Inventory Status</th>
            </tr>
    '''
    source_buckets = ddb.get_source_buckets()
    for account_id in source_buckets:
        # Let's calculate the value for rowspan
        account_row_span = sum([
            len(source_buckets[account_id][region])
            for region in source_buckets[account_id]
        ])
        inserted_account_row = False
        for region in source_buckets[account_id]:
            region_row_span = len(source_buckets[account_id][region])
            inserted_region_row = False
            for inventory_status in [
                    done_status_value, bucket_is_empty_status_value,
                    everything_else_status_value
            ]:
                for bucket in source_buckets[account_id][region]:
                    should_include = False
                    if not utility.compare_strings(
                            inventory_status, everything_else_status_value):
                        should_include = utility.compare_strings(
                            inventory_status, bucket.inventory_status)
                    else:
                        already_included = utility.compare_strings(
                            done_status_value, bucket.inventory_status
                        ) or utility.compare_strings(
                            bucket_is_empty_status_value,
                            bucket.inventory_status)
                        should_include = not already_included
                    if should_include:
                        css_class_name = css_mappings[inventory_status]
                        row = "<tr>"
                        if not inserted_account_row:
                            inserted_account_row = True
                            row = row + "<td rowspan={0}>{1}</td>".format(
                                account_row_span, account_id)
                        if not inserted_region_row:
                            inserted_region_row = True
                            row = row + "<td rowspan={0}>{1}</td>".format(
                                region_row_span, region)
                        row = row + f'<td class="{css_class_name}">{bucket.name}</td>'
                        row = row + f'<td class="{css_class_name}">{bucket.inventory_status}</td></tr>'
                        bucket_html_table = bucket_html_table + row
    bucket_html_table = bucket_html_table + "</table>"
    query_details_html = query_details_html + bucket_html_table

    input_parameters_str = json.dumps(
        input_parameters,
        default=lambda input_parameters: input_parameters.__dict__,
        sort_keys=True,
        indent=4,
        separators=(',', ': '))
    input_parameters_section = '''
        <br>
        <h4>Input Parameters</h4>
        <p>
            <div style="white-space: pre-wrap;">
                The execution parameters used for this run are given below.
                {0}
            </div>
        </p>
    '''.format(input_parameters_str)
    query_details_html = query_details_html + input_parameters_section + '</body></html>'
    logging.info(f'welcome email content:{query_details_html}')

    ses_client = awshelper.get_client(awshelper.ServiceName.ses)
    response = ses_client.send_email(
        Destination={
            'ToAddresses': input_parameters.recipient_email_addresses,
        },
        Message={
            'Body': {
                'Html': {
                    'Charset': 'UTF-8',
                    'Data': query_details_html,
                },
                'Text': {
                    'Charset': 'UTF-8',
                    'Data': query_details_html,
                },
            },
            'Subject': {
                'Charset': 'UTF-8',
                'Data': 'Your S3Insights snapshot is ready',
            },
        },
        Source=input_parameters.sender_email_address,
    )
    logging.info(f'send email api response:{response}')

def cleanup_and_verify():
    """ Cleanup smoke test resources and verify that smoke test worked as expected

    Raises:
        utility.S3InsightsException: If the test fails
    """
    input_parameters = ddb.get_input_parameters()
    database_name = input_parameters.athena_database_name
    run_id = input_parameters.run_id
    is_manual_cleanup = ddb.is_manual_cleanup()
    athena_client = awshelper.get_client(awshelper.ServiceName.athena)
    try:
        athena.run_query(
            run_id,
            athena_client,
            'DROP TABLE {0}'.format(input_parameters.athena_table_name),
            database_name,
            True)
    except utility.S3InsightsException as e:
        logging.info(f'received exception while deleting athena table: {e}')
        if is_manual_cleanup:
            logging.info('ignoring the exception as this is a manual cleanup operation')
        else:
            raise
    try:
        athena.run_query(
            run_id,
            athena_client,
            'DROP DATABASE {0}'.format(database_name),
            None,
            True)
    except utility.S3InsightsException as e:
        logging.info(f'received exception while deleting athena database: {e}')
        if is_manual_cleanup:
            logging.info('ignoring the exception as this is a manual cleanup operation')
        else:
            raise
    s3_resource = awshelper.get_resource(awshelper.ServiceName.s3)
    s3_athena_output_prefix = athena.get_s3_output_location_prefix(run_id)
    consolidated_bucket = s3_resource.Bucket(
        config.DeploymentDetails.consolidated_inventory_bucket_name)
    athena_output_objects = consolidated_bucket.objects.filter(
        Prefix=s3_athena_output_prefix)
    athena_output_objects.delete()
    did_smoke_test_fail = False
    if len(run_id) > 0:
        s3_inventory_prefix = s3.get_inventory_prefix_at_consolidated_bucket(run_id)
        objects = consolidated_bucket.objects.filter(Prefix=s3_inventory_prefix)
        objects_count = 0
        for obj in objects:
            objects_count += 1
        logging.info(
            f'Number of objects that were created in the consolidation bucket:{objects_count}')
        objects = consolidated_bucket.objects.filter(Prefix=s3_inventory_prefix)
        objects.delete()
        if objects_count == 0:
            did_smoke_test_fail = True
    else:
        did_smoke_test_fail = True
    if is_manual_cleanup is not True and did_smoke_test_fail:
        raise utility.S3InsightsException(
            'smoke test failed. Clean up operation itself might have succeeded.')

def create_resources():
    """ Create Athena resources once all inventory objects have been
        partitioned and stored in the consolidation bucket.
    """
    input_parameters = ddb.get_input_parameters()
    database_name = input_parameters.athena_database_name
    table_name = input_parameters.athena_table_name
    athena_client = awshelper.get_client(awshelper.ServiceName.athena)
    create_database_query = 'CREATE DATABASE IF NOT EXISTS {0}'.format(database_name)
    logging.info(f'create database query={create_database_query}')
    run_id = input_parameters.run_id
    run_query(run_id, athena_client, create_database_query, None, True)
    athena_table_format = """
        CREATE EXTERNAL TABLE {0}(
            bucket string,
            key string,
            version_id string,
            is_latest boolean,
            is_delete_marker boolean,
            size bigint,
            last_modified_date timestamp,
            e_tag string,
            storage_class string,
            is_multipart_uploaded boolean,
            replication_status string,
            encryption_status string,
            object_lock_retain_until_date timestamp,
            object_lock_mode string,
            object_lock_legal_hold_status string)
        PARTITIONED BY (
            account string,
            region string,
            bucketname string)
        ROW FORMAT SERDE
            'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
        STORED AS INPUTFORMAT
            'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
        OUTPUTFORMAT
            'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
        LOCATION
            's3://{1}/{2}'
        """
    create_table_query = athena_table_format.format(
        table_name,
        config.DeploymentDetails.consolidated_inventory_bucket_name,
        s3.get_inventory_prefix_at_consolidated_bucket(run_id))
    logging.info(f'create table query={create_table_query}')
    run_query(run_id, athena_client, create_table_query, database_name, True)
    run_query(run_id, athena_client, 'MSCK REPAIR TABLE {0}'.format(table_name), database_name, True)
    query_execution_details = {}
    for athena_query in input_parameters.athena_queries:
        execution_id = run_query(
            run_id,
            athena_client,
            athena_query.query.replace("{ATHENA_TABLE}", table_name),
            database_name,
            False)
        query_execution_details[execution_id] = athena_query
        logging.info('Execution Id: {0} Name: {1} Query:{2}'.format(
            execution_id, athena_query.name, athena_query.query))
    ddb.store_athena_queries(query_execution_details)

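# Illustrative analysis query (assumed, not taken from the actual configuration)
# showing how the {ATHENA_TABLE} placeholder substituted above might appear in an
# entry of input_parameters.athena_queries before str.replace runs:
#
#   SELECT storage_class, count(*) AS object_count
#   FROM {ATHENA_TABLE}
#   GROUP BY storage_class
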
def create_bucket_inventory_configurations(run_id, source_buckets):
    """ Enable S3 inventory for the given list of source buckets

    Arguments:
        run_id {string} -- run_id for the current Step Function execution
        source_buckets {dict<string, dict<string, list(string)>>} -- Source buckets
    """
    host_account_id = awshelper.SessionManager.get_host_account_id()
    for account_id in source_buckets:
        for region in source_buckets[account_id]:
            s3_resource = awshelper.get_resource(
                awshelper.ServiceName.s3, account_id, run_id)
            s3_client = awshelper.get_client(
                awshelper.ServiceName.s3, region, account_id, run_id)
            for bucket_name in source_buckets[account_id][region]:
                logging.info(f'Processing {bucket_name} in {region} from {account_id}')
                is_empty, client_error = is_bucket_empty(s3_resource, bucket_name)
                if client_error is None:
                    if is_empty:
                        # Update DB status
                        logging.info(f'{bucket_name} in {region} from {account_id} is empty')
                        ddb.update_source_bucket_inventory_status(
                            bucket_name, ddb.BucketInventoryStatus.bucket_is_empty)
                    else:
                        destination_prefix = account_id + "/" + region
                        destination_bucket = "arn:aws:s3:::" + get_destination_bucket_name(run_id, region)
                        inventory_id = utility.get_resource_name(run_id, 's3-inventory', 'orc')
                        inventory_configuration_orc = {
                            "Schedule": {
                                "Frequency": "Daily"
                            },
                            "IsEnabled": True,
                            "Destination": {
                                "S3BucketDestination": {
                                    "Prefix": destination_prefix,
                                    "Format": "ORC",
                                    "Bucket": destination_bucket,
                                    "AccountId": host_account_id
                                }
                            },
                            "OptionalFields": [
                                "Size",
                                "LastModifiedDate",
                                "StorageClass",
                                "ETag",
                                "ReplicationStatus",
                                "IsMultipartUploaded",
                                "EncryptionStatus",
                                "ObjectLockMode",
                                "ObjectLockRetainUntilDate",
                                "ObjectLockLegalHoldStatus"
                            ],
                            "IncludedObjectVersions": "All",
                            "Id": inventory_id
                        }
                        try:
                            response = s3_client.put_bucket_inventory_configuration(
                                Bucket=bucket_name,
                                Id=inventory_id,
                                InventoryConfiguration=inventory_configuration_orc)
                            logging.info(f'put bucket inventory configuration response:{response}')
                            ddb.update_source_bucket_inventory_status(
                                bucket_name, ddb.BucketInventoryStatus.in_progress)
                        except ClientError as e:
                            logging.error(f'error while creating inventory configuration on {account_id}:{region}:{bucket_name}. error details:{e}')
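
# Minimal usage sketch for create_bucket_inventory_configurations (the run id,
# account id, and bucket name below are made up for illustration):
#
#   source_buckets = {"111111111111": {"us-east-1": ["example-bucket-a"]}}
#   create_bucket_inventory_configurations("example-run-id", source_buckets)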