def does_athena_table_exist(run_id, athena_database_name, athena_table_name):
    """ Checks if an Athena table already exists

    Arguments:
        run_id {string} -- run_id for the current Step Function execution
        athena_database_name {string} -- Athena database to use for query execution
        athena_table_name {string} -- Athena table name

    Returns:
        boolean -- Flag representing if the table already exists
    """
    exists = False
    athena_client = awshelper.get_client(awshelper.ServiceName.athena)
    execution_id = None
    try:
        execution_id = run_query(
            run_id,
            athena_client,
            f'SHOW TABLES IN {athena_database_name}',
            None,
            True)
    except utility.S3InsightsException as e:
        logging.info(f'received exception while listing tables: {e}')
    if execution_id is not None:
        result = athena_client.get_query_results(QueryExecutionId=execution_id)
        for row in result['ResultSet']['Rows']:
            table_name = row['Data'][0]['VarCharValue']
            if utility.compare_strings(athena_table_name, table_name):
                exists = True
                break
    return exists
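# Note: utility.compare_strings is assumed throughout this module to be a
# case-insensitive string equality helper. Its real implementation lives in
# utility.py and may differ; the sketch below only illustrates the contract
# that the callers above rely on.
def _compare_strings_sketch(first, second):
    """Illustrative stand-in for utility.compare_strings (assumption, not the real helper)."""
    if first is None or second is None:
        return first is second
    return first.lower() == second.lower()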
def run_query(run_id, athena_client, query, athena_database_name, wait_to_finish):
    """ Run the given Athena query

    Arguments:
        run_id {string} -- run_id for the current Step Function execution
        athena_client {boto3.client} -- Boto3 Athena client
        query {string} -- Athena query to execute
        athena_database_name {string} -- Athena database to use for query execution
        wait_to_finish {boolean} -- Should the method wait for the Athena query to finish?

    Raises:
        utility.S3InsightsException: when the Athena query fails

    Returns:
        string -- Athena query execution id
    """
    output_location = {
        'OutputLocation': 's3://{0}/{1}'.format(
            config.DeploymentDetails.consolidated_inventory_bucket_name,
            get_s3_output_location_prefix(run_id)),
    }
    if athena_database_name is not None:
        query_response = athena_client.start_query_execution(
            QueryString=query,
            QueryExecutionContext={'Database': athena_database_name},
            ResultConfiguration=output_location)
    else:
        query_response = athena_client.start_query_execution(
            QueryString=query,
            ResultConfiguration=output_location)
    execution_id = query_response['QueryExecutionId']
    if wait_to_finish:
        for attempt_count in range(1, 10):
            query_status = athena_client.get_query_execution(
                QueryExecutionId=execution_id)
            query_execution_status = query_status['QueryExecution']['Status']['State']
            if utility.compare_strings(query_execution_status, 'succeeded'):
                break
            elif utility.compare_strings(query_execution_status, 'failed'):
                raise utility.S3InsightsException(
                    'Athena query failed for unknown reasons')
            time.sleep(30)
    return execution_id
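# Hypothetical usage sketch for run_query: submit a query against an assumed
# database and block until it finishes. The run id and database name below are
# placeholders for illustration, not values produced by the pipeline.
def _example_run_query():
    athena_client = awshelper.get_client(awshelper.ServiceName.athena)
    execution_id = run_query(
        'example-run-id',        # hypothetical Step Function run id
        athena_client,
        'SELECT 1',              # trivial query for illustration
        's3insights_demo_db',    # hypothetical Athena database
        True)
    return execution_id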
def simulate(input_parameters):
    """ Simulate a smoke test

    Arguments:
        input_parameters {config.S3InsightsInput} -- Input parameters for the current execution

    Raises:
        utility.S3InsightsException: If none of the source buckets can generate inventory reports
    """
    logging.info('simulating smoke test in the current environment')
    account_id = None
    region = None
    bucket_name = None
    account_ids = [
        account_config.id for account_config in input_parameters.accounts
    ]
    # Find the first source bucket whose inventory job is still in progress
    for candidate_account_id in account_ids:
        source_buckets_details = ddb.get_source_buckets_details(candidate_account_id)
        for ddb_bucket in source_buckets_details:
            inventory_status = ddb_bucket[ddb.TableFieldName.inventory_status]
            if utility.compare_strings(inventory_status,
                                       ddb.BucketInventoryStatus.in_progress):
                account_id = ddb_bucket[ddb.TableFieldName.account_id]
                region = ddb_bucket[ddb.TableFieldName.region]
                bucket_name = ddb_bucket[ddb.TableFieldName.sortkey]
                break
        if account_id is not None:
            break
    if bucket_name is None:
        raise utility.S3InsightsException(
            'could not find a bucket for smoke test')
    s3_client = awshelper.get_client(awshelper.ServiceName.s3)
    file_path = utility.get_file_path(
        __file__, "smoketestdata/sample_inventory_object.orc")
    s3_key = "{0}/{1}/{2}/inventorysmoketest/data/smoke_test_inventory_object.orc".format(
        account_id, region, bucket_name)
    destination_bucket_name = s3.get_destination_bucket_name(
        input_parameters.run_id, region)
    logging.info(
        f'smoke test destination_bucket_name:{destination_bucket_name} s3_key:{s3_key}')
    response = s3_client.upload_file(file_path, destination_bucket_name, s3_key)
    logging.info(f'uploaded a sample inventory object. response:{response}')
    s3_key = "{0}/{1}/{2}/inventorysmoketest/somedate/manifest.checksum".format(
        account_id, region, bucket_name)
    sleep_time_in_seconds = config.ServiceParameters.smoke_test_sleep_time_in_seconds
    time.sleep(sleep_time_in_seconds)
    response = s3_client.upload_file(file_path, destination_bucket_name, s3_key)
    logging.info(f'uploaded a sample manifest checksum. response:{response}')
    time.sleep(sleep_time_in_seconds)
def process_inventory_object(event):
    """ Process an inventory object once it has been stored in a staging destination bucket

    Arguments:
        event {json} -- S3 notification event
    """
    if 'Records' not in event:
        logging.warning('Received an unexpected SQS notification')
        return
    input_parameters = ddb.get_input_parameters()
    for record in event['Records']:
        if 'body' not in record:
            continue
        body_json = json.loads(record['body'])
        if 'Records' not in body_json:
            continue
        for s3_record in body_json['Records']:
            if 'eventName' not in s3_record or s3_record['eventName'] != 'ObjectCreated:Put':
                continue
            source_bucket_name = s3_record['s3']['bucket']['name']
            if not is_destination_bucket_name(input_parameters, source_bucket_name):
                continue
            source_object_key = s3_record['s3']['object']['key']
            # Inventory objects are stored under <account_id>/<region>/<bucket_name>/...
            object_key_parts = source_object_key.split('/')
            object_key_parts_len = len(object_key_parts)
            bucket_account_id = object_key_parts[0].lower()
            bucket_region = object_key_parts[1].lower()
            bucket_name = object_key_parts[2].lower()
            logging.info(
                f'source_object_key:{source_object_key} bucket_account_id:{bucket_account_id} bucket_name:{bucket_name}')
            if ddb.is_inprogress_inventory_job(bucket_account_id, bucket_name) and object_key_parts_len > 4:
                if utility.compare_strings(object_key_parts[object_key_parts_len - 1], 'manifest.checksum'):
                    # The checksum is the last object S3 delivers for an inventory report
                    ddb.update_source_bucket_inventory_status(
                        bucket_name, ddb.BucketInventoryStatus.done)
                    remove_bucket_inventory_configuration(
                        input_parameters.run_id,
                        bucket_account_id,
                        bucket_region,
                        bucket_name)
                elif utility.compare_strings(object_key_parts[object_key_parts_len - 2], 'data'):
                    copy_inventory_object_into_consolidation_bucket(
                        input_parameters.run_id,
                        source_bucket_name,
                        source_object_key,
                        config.DeploymentDetails.consolidated_inventory_bucket_name)
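# Hedged example of the SQS-wrapped S3 notification shape that
# process_inventory_object expects. All ids, bucket names, and keys below are
# made up for illustration.
def _example_inventory_notification():
    return {
        'Records': [
            {
                'body': json.dumps({
                    'Records': [
                        {
                            'eventName': 'ObjectCreated:Put',
                            's3': {
                                'bucket': {'name': 'example-staging-destination-bucket'},
                                'object': {'key': '123456789012/us-east-1/example-source-bucket/config/data/part-0.orc'},
                            },
                        },
                    ],
                }),
            },
        ],
    }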
def is_complete(self, in_progress_state_name, complete_state_name):
    """ Check if the CloudFormation stack operation has finished

    Arguments:
        in_progress_state_name {string} -- In-progress state name
        complete_state_name {string} -- Complete state name

    Raises:
        utility.S3InsightsException: When the stack status is neither in-progress nor complete

    Returns:
        boolean -- Flag indicating if the operation has finished successfully
    """
    details = self.client.describe_stacks(StackName=self.name)
    logging.info(f'current stack details: {self.name} {details}')
    current_status = details['Stacks'][0]['StackStatus'].lower()
    if utility.compare_strings(current_status, in_progress_state_name):
        return False
    elif utility.compare_strings(current_status, complete_state_name):
        return True
    else:
        error_message = (
            f'unknown stack status. Stack name:{self.name} '
            f'current status:{current_status} '
            f'expected status:{in_progress_state_name}/{complete_state_name} '
            f'details:{details}')
        raise utility.S3InsightsException(error_message)
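# Hypothetical polling sketch built on top of is_complete. The real
# orchestration is driven by the Step Function, so this loop is illustration
# only; the poll interval and attempt count are arbitrary assumptions.
def _wait_for_stack_sketch(stack, in_progress_state_name, complete_state_name,
                           poll_interval_in_seconds=30, max_attempts=20):
    """Poll until the stack reaches the complete state or attempts are exhausted."""
    for _ in range(max_attempts):
        if stack.is_complete(in_progress_state_name, complete_state_name):
            return True
        time.sleep(poll_interval_in_seconds)
    return False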
def is_inprogress_inventory_job(account_id, bucket_name):
    """ Check if the inventory job is in progress for a given source bucket

    Arguments:
        account_id {string} -- AWS account id
        bucket_name {string} -- Source bucket name

    Returns:
        boolean -- Flag indicating if the job is in progress
    """
    result = False
    table = get_table()
    response = table.query(
        KeyConditionExpression=Key(TableFieldName.partitionkey).eq(TableValueCategory.source_bucket)
        & Key(TableFieldName.sortkey).eq(bucket_name))
    if 'Items' in response and len(response['Items']) == 1:
        item = response['Items'][0]
        inventory_status = item[TableFieldName.inventory_status]
        if utility.compare_strings(inventory_status,
                                   BucketInventoryStatus.in_progress):
            result = True
        elif utility.compare_strings(inventory_status,
                                     BucketInventoryStatus.done):
            if TableFieldName.change_timestamp in item:
                utc_epoch_time_decimal = item[TableFieldName.change_timestamp]
                utc_epoch_time = float(utc_epoch_time_decimal)
                inventory_completion_timestamp = datetime.utcfromtimestamp(utc_epoch_time)
                current_timestamp = datetime.utcnow()
                delta = current_timestamp - inventory_completion_timestamp
                delta_seconds = delta.total_seconds()
                if delta_seconds < (config.ServiceParameters.buffer_time_after_nventory_completion_in_hours * 3600):
                    # This must be an out-of-order notification. Let's copy this inventory file
                    result = True
    return result
def update_source_bucket_inventory_status(input_parameters, account_id):
    """ Update inventory status for all applicable source buckets under the given AWS account

    Arguments:
        input_parameters {config.S3InsightsInput} -- Input parameters for the current execution
        account_id {string} -- AWS account id
    """
    source_buckets = get_source_buckets(input_parameters, account_id)
    source_buckets_details = ddb.get_source_buckets_details(account_id)
    s3_resources = {}
    for ddb_bucket in source_buckets_details:
        inventory_status = ddb_bucket[ddb.TableFieldName.inventory_status]
        if utility.compare_strings(inventory_status,
                                   ddb.BucketInventoryStatus.in_progress):
            ddb_account_id = ddb_bucket[ddb.TableFieldName.account_id]
            ddb_region = ddb_bucket[ddb.TableFieldName.region]
            ddb_bucket_name = ddb_bucket[ddb.TableFieldName.sortkey]
            if not (ddb_account_id in source_buckets
                    and ddb_region in source_buckets[ddb_account_id]
                    and ddb_bucket_name in source_buckets[ddb_account_id][ddb_region]):
                logging.info(f'{ddb_bucket_name} is not available anymore. Updating the DDB entry')
                ddb.update_source_bucket_inventory_status(
                    ddb_bucket_name,
                    ddb.BucketInventoryStatus.bucket_not_available)
            else:
                if ddb_account_id not in s3_resources:
                    s3_resources[ddb_account_id] = awshelper.get_resource(
                        awshelper.ServiceName.s3,
                        ddb_account_id,
                        input_parameters.run_id)
                s3_resource = s3_resources[ddb_account_id]
                is_empty, client_error = is_bucket_empty(s3_resource, ddb_bucket_name)
                remove_inventory_configuration = False
                if client_error is not None:
                    logging.info(f'{ddb_bucket_name} bucket access lost for some reason. Updating the DDB entry')
                    ddb.update_source_bucket_inventory_status(
                        ddb_bucket_name,
                        ddb.BucketInventoryStatus.bucket_access_lost)
                    remove_inventory_configuration = True
                elif is_empty:
                    logging.info(f'{ddb_bucket_name} is empty. Updating the DDB entry')
                    ddb.update_source_bucket_inventory_status(
                        ddb_bucket_name,
                        ddb.BucketInventoryStatus.bucket_is_empty)
                    remove_inventory_configuration = True
                if remove_inventory_configuration:
                    remove_bucket_inventory_configuration(
                        input_parameters.run_id,
                        ddb_account_id,
                        ddb_region,
                        ddb_bucket_name)
def is_destination_bucket_name(input_parameters, bucket_name):
    """ Check if the given S3 bucket is a destination bucket

    Arguments:
        input_parameters {config.S3InsightsInput} -- Input parameters for the current execution
        bucket_name {string} -- Bucket name

    Returns:
        boolean -- Flag indicating if the bucket is a destination bucket
    """
    for region in input_parameters.supported_regions:
        destination_bucket_name = get_destination_bucket_name(input_parameters.run_id, region)
        if utility.compare_strings(destination_bucket_name, bucket_name):
            return True
    return False
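# get_destination_bucket_name is defined elsewhere in the project; the sketch
# below is only a hypothetical illustration of a per-run, per-region naming
# scheme and does not reflect the actual convention used by the platform.
def _destination_bucket_name_sketch(run_id, region):
    return f's3insights-{run_id}-{region}'.lower()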
def get_source_buckets(input_parameters, account_id):
    """ Get all eligible source buckets

    Arguments:
        input_parameters {config.S3InsightsInput} -- Input parameters for the current execution
        account_id {string} -- AWS account id

    Returns:
        dict<string, dict<string, list(string)>> -- Source buckets grouped by account and region
    """
    source_buckets = {}
    account_id = account_id.lower()
    account_config = next(
        account_config for account_config in input_parameters.accounts
        if utility.compare_strings(account_config.id, account_id))
    source_buckets[account_id] = {}
    s3_client = awshelper.get_client(
        awshelper.ServiceName.s3,
        None,
        account_id,
        input_parameters.run_id)

    # Exclude the consolidation and inventory destination buckets
    pipeline_buckets = []
    if utility.compare_strings(account_id, awshelper.SessionManager.get_host_account_id()):
        pipeline_buckets.append(config.DeploymentDetails.consolidated_inventory_bucket_name)
        for region in input_parameters.supported_regions:
            bucket_name = get_destination_bucket_name(input_parameters.run_id, region)
            pipeline_buckets.append(bucket_name)

    response = s3_client.list_buckets()
    for bucket in response["Buckets"]:
        name = bucket["Name"].lower()
        if name not in account_config.exclude and name not in pipeline_buckets:
            try:
                location = s3_client.get_bucket_location(Bucket=name)
                region = location['LocationConstraint']
                # A null LocationConstraint means the bucket lives in us-east-1
                if region is None:
                    region = 'us-east-1'
                region = region.lower()
                if region in input_parameters.supported_regions:
                    if region not in source_buckets[account_id]:
                        source_buckets[account_id][region] = []
                    source_buckets[account_id][region].append(name)
            except ClientError as e:
                logging.error(f'error while retrieving bucket information for {account_id}:{name}. error details: {e}')
    return source_buckets
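# Illustrative shape of the dictionary returned by get_source_buckets; the
# account id and bucket names below are made up.
def _example_source_buckets_shape():
    return {
        '123456789012': {
            'us-east-1': ['example-bucket-a', 'example-bucket-b'],
            'us-west-2': ['example-bucket-c'],
        },
    }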
def copy_inventory_object_into_consolidation_bucket(run_id,
                                                    source_bucket_name,
                                                    source_object_key,
                                                    destination_bucket_name):
    """ Copy an inventory object into the consolidation bucket

    Arguments:
        run_id {string} -- run_id for the current Step Function execution
        source_bucket_name {string} -- Source bucket name
        source_object_key {string} -- Source object key
        destination_bucket_name {string} -- Destination bucket name
    """
    session = awshelper.SessionManager.get_host_aws_session()
    s3_resource = session.resource(
        's3',
        config=botocore_config(s3={'use_accelerate_endpoint': True}))
    object_key_parts = source_object_key.split('/')
    object_key_parts_len = len(object_key_parts)
    new_object_key = "{0}account={1}/region={2}/bucketname={3}/{4}".format(
        get_inventory_prefix_at_consolidated_bucket(run_id),
        object_key_parts[0],
        object_key_parts[1],
        object_key_parts[2],
        object_key_parts[object_key_parts_len - 1])
    copy_source = {
        'Bucket': source_bucket_name,
        'Key': source_object_key
    }
    try:
        s3_resource.meta.client.copy(
            copy_source,
            destination_bucket_name,
            new_object_key)
    except ClientError as e:
        if ('Error' in e.response
                and 'Code' in e.response['Error']
                and utility.compare_strings(e.response['Error']['Code'], "slowdown")):
            # S3 is throttling copy operations. Back off for a few seconds before re-raising
            # so that the message gets retried later.
            wait_in_seconds = random.randint(1, 120)
            logging.warning(
                f's3 is throttling copy request for {source_bucket_name}:{source_object_key}. wait time in seconds:{wait_in_seconds}')
            time.sleep(wait_in_seconds)
        raise
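# Illustrative walk-through of the key rewrite performed above; the run prefix,
# account id, and bucket name are made up.
def _example_consolidated_key():
    source_object_key = '123456789012/us-east-1/example-source-bucket/config/data/part-0.orc'
    parts = source_object_key.split('/')
    # -> 'run-prefix/account=123456789012/region=us-east-1/bucketname=example-source-bucket/part-0.orc'
    return 'run-prefix/account={0}/region={1}/bucketname={2}/{3}'.format(
        parts[0], parts[1], parts[2], parts[-1])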
def send_welcome_email():
    """ Send the welcome email
    """
    queries = ddb.get_athena_queries()
    query_details_html = '''
    <html>
    <head>
        <style>
            table, th, td {
                border: 1px solid black;
                border-collapse: collapse;
            }
            .success {
                background-color: rgba(0, 255, 0, 0.2);
            }
            .failed {
                background-color: rgba(255, 0, 0, 0.2);
            }
            .neutral {
                background-color: white;
            }
        </style>
    </head>
    <body>
        <p>
            Your latest <a href="https://github.com/kurmiashish/S3Insights/blob/master/docs/user_guide.md#how-to-initiate-a-state-machine-execution">S3Insights Harvester execution</a> generated this welcome email. You can learn more about the platform <a href="https://github.com/kurmiashish/S3Insights">here</a>.
        </p>
    '''
    intro_html = 'In this run, the following Athena queries were executed. You can run additional Athena queries manually by following <a href="https://github.com/kurmiashish/S3Insights/blob/master/docs/user_guide.md#running-athena-analysis-queries-manually">these instructions</a>. Please refer to the <a href="https://github.com/kurmiashish/S3Insights/blob/master/docs/troubleshooting.md#athena-failures">Athena troubleshooting document</a> if any of the following Athena queries have failed.'
    input_parameters = ddb.get_input_parameters()
    if input_parameters.is_smoke_test:
        intro_html = intro_html + ' <b>As this is a smoke test run, the following links may not work because the platform may have deleted the Athena resources.</b>'
    query_details_html = query_details_html + "<h4>Analysis Queries</h4><p>" + intro_html + "</p>"
    query_details_html = query_details_html + '''
    <table>
        <tr>
            <th>Name</th>
            <th>Query</th>
            <th>Status</th>
            <th>Execution Details</th>
        </tr>
    '''
    succeeded_status_value = 'succeeded'
    done_status_value = 'done'
    bucket_is_empty_status_value = 'bucket_is_empty'
    everything_else_status_value = 'everything_else'
    success_css_class_name = 'success'
    failed_css_class_name = 'failed'
    neutral_css_class_name = 'neutral'
    css_mappings = {
        succeeded_status_value: success_css_class_name,
        done_status_value: success_css_class_name,
        everything_else_status_value: failed_css_class_name,
        bucket_is_empty_status_value: neutral_css_class_name
    }
    # List successful queries first, then everything else
    for state in [succeeded_status_value, everything_else_status_value]:
        for query in queries:
            should_include = False
            if not utility.compare_strings(state, everything_else_status_value):
                should_include = utility.compare_strings(state, query.state)
            else:
                should_include = not utility.compare_strings(
                    succeeded_status_value, query.state)
            if should_include:
                css_class_name = css_mappings[state]
                query_web_console_link = 'https://console.aws.amazon.com/athena/home?region={0}#query/history/{1}'.format(
                    config.DeploymentDetails.region,
                    query.query_execution_id)
                query_web_console_link_html = '<a href={0}> Web Console Link </a>'.format(
                    query_web_console_link)
                query_details_html = query_details_html + f'<tr class="{css_class_name}"><td>' + ' </td><td>'.join(
                    [
                        query.query_name,
                        query.actual_query,
                        query.state,
                        query_web_console_link_html
                    ]) + '</td></tr>'
    query_details_html = query_details_html + '</table><br>'
    bucket_html_table = '''
    <h4>Source buckets</h4>
    <p>
        The following buckets are included in the analysis. If the platform failed to generate inventory for any of the buckets (i.e., if any entry in the following table is highlighted in red), please consult the <a href="https://github.com/kurmiashish/S3Insights/blob/master/docs/troubleshooting.md#inventory-generation-failures">inventory generation troubleshooting document</a>.
    </p>
    <table>
        <tr>
            <th>Account</th>
            <th>Region</th>
            <th>Bucket</th>
            <th>Inventory Status</th>
        </tr>
    '''
    source_buckets = ddb.get_source_buckets()
    for account_id in source_buckets:
        # Calculate the rowspan value for the account cell
        account_row_span = sum([
            len(source_buckets[account_id][region])
            for region in source_buckets[account_id]
        ])
        inserted_account_row = False
        for region in source_buckets[account_id]:
            region_row_span = len(source_buckets[account_id][region])
            inserted_region_row = False
            for inventory_status in [
                    done_status_value, bucket_is_empty_status_value,
                    everything_else_status_value
            ]:
                for bucket in source_buckets[account_id][region]:
                    should_include = False
                    if not utility.compare_strings(
                            inventory_status, everything_else_status_value):
                        should_include = utility.compare_strings(
                            inventory_status, bucket.inventory_status)
                    else:
                        already_included = utility.compare_strings(
                            done_status_value, bucket.inventory_status
                        ) or utility.compare_strings(
                            bucket_is_empty_status_value,
                            bucket.inventory_status)
                        should_include = not already_included
                    if should_include:
                        css_class_name = css_mappings[inventory_status]
                        row = "<tr>"
                        if not inserted_account_row:
                            inserted_account_row = True
                            row = row + "<td rowspan={0}>{1}</td>".format(
                                account_row_span, account_id)
                        if not inserted_region_row:
                            inserted_region_row = True
                            row = row + "<td rowspan={0}>{1}</td>".format(
                                region_row_span, region)
                        row = row + f'<td class="{css_class_name}">{bucket.name}</td>'
                        row = row + f'<td class="{css_class_name}">{bucket.inventory_status}</td></tr>'
                        bucket_html_table = bucket_html_table + row
    bucket_html_table = bucket_html_table + "</table>"
    query_details_html = query_details_html + bucket_html_table
    input_parameters_str = json.dumps(
        input_parameters,
        default=lambda input_parameters: input_parameters.__dict__,
        sort_keys=True,
        indent=4,
        separators=(',', ': '))
    input_parameters_section = '''
    <br>
    <h4>Input Parameters</h4>
    <p>
        <div style="white-space: pre-wrap;">
The execution parameters used for this run are given below.
{0}
        </div>
    </p>
    '''.format(input_parameters_str)
    query_details_html = query_details_html + input_parameters_section + '</body></html>'
    logging.info(f'welcome email content:{query_details_html}')
    ses_client = awshelper.get_client(awshelper.ServiceName.ses)
    response = ses_client.send_email(
        Destination={
            'ToAddresses': input_parameters.recipient_email_addresses,
        },
        Message={
            'Body': {
                'Html': {
                    'Charset': 'UTF-8',
                    'Data': query_details_html,
                },
                'Text': {
                    'Charset': 'UTF-8',
                    'Data': query_details_html,
                },
            },
            'Subject': {
                'Charset': 'UTF-8',
                'Data': 'Your S3Insights snapshot is ready',
            },
        },
        Source=input_parameters.sender_email_address,
    )
    logging.info(f'send email api response:{response}')