Example #1
def does_athena_table_exist(run_id, athena_database_name, athena_table_name):
    """ Checks if an Athena table already exists

    Arguments:
        run_id {string} -- run_id for the current Step Function execution
        athena_database_name {string} -- Athena database to use for query execution
        athena_table_name {string} -- Athena table name

    Returns:
        boolean -- Flag representing if the table already exists
    """
    exists = False
    athena_client = awshelper.get_client(awshelper.ServiceName.athena)
    execution_id = None
    try:
        execution_id = run_query(run_id, athena_client,
                                 f'SHOW TABLES IN {athena_database_name}',
                                 None, True)
    except utility.S3InsightsException as e:
        logging.info(f'received exception while listing tables: {e}')

    if execution_id is not None:
        result = athena_client.get_query_results(QueryExecutionId=execution_id)
        for row in result['ResultSet']['Rows']:
            table_name = row['Data'][0]['VarCharValue']
            if utility.compare_strings(athena_table_name, table_name):
                exists = True
                break
    return exists
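
A minimal usage sketch follows; the create_table_if_needed wrapper and its DDL argument are illustrative assumptions rather than part of the project, and run_query is the helper shown in Example #2.

def create_table_if_needed(run_id, athena_database_name, athena_table_name, create_table_ddl):
    # Issue the CREATE TABLE DDL only when the table is not registered yet.
    if not does_athena_table_exist(run_id, athena_database_name, athena_table_name):
        athena_client = awshelper.get_client(awshelper.ServiceName.athena)
        # Wait for the DDL statement to finish before returning.
        run_query(run_id, athena_client, create_table_ddl,
                  athena_database_name, True)
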
Example #2
def run_query(run_id, athena_client, query, athena_database_name,
              wait_to_finish):
    """ Run the given Athena query

    Arguments:
        run_id {string} -- run_id for the current Step Function execution
        athena_client {boto3.client} -- Boto3 Athena client
        query {string} -- Athena query to execute
        athena_database_name {string} -- Athena database to use for query execution
        wait_to_finish {boolean} -- Should the method wait for the Athena query to finish?

    Raises:
        utility.S3InsightsException: when Athena query fails

    Returns:
        string -- Athena execution id
    """
    output_location = {
        'OutputLocation':
        's3://{0}/{1}'.format(
            config.DeploymentDetails.consolidated_inventory_bucket_name,
            get_s3_output_location_prefix(run_id)),
    }

    if athena_database_name is not None:
        query_response = athena_client.start_query_execution(
            QueryString=query,
            QueryExecutionContext={'Database': athena_database_name},
            ResultConfiguration=output_location)
    else:
        query_response = athena_client.start_query_execution(
            QueryString=query, ResultConfiguration=output_location)

    execution_id = query_response['QueryExecutionId']
    if wait_to_finish:
        for attempt_count in range(1, 10):
            query_status = athena_client.get_query_execution(
                QueryExecutionId=execution_id)
            query_execution_status = query_status['QueryExecution']['Status'][
                'State']
            if utility.compare_strings(query_execution_status, 'succeeded'):
                break
            elif utility.compare_strings(query_execution_status, 'failed'):
                raise utility.S3InsightsException(
                    'Athena query failed for unknown reasons')
            time.sleep(30)
    return execution_id
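
A hedged usage sketch; the query strings and database name below are placeholders, not queries used by the project:

athena_client = awshelper.get_client(awshelper.ServiceName.athena)

# Fire-and-forget: no database context and no waiting for completion.
execution_id = run_query(run_id, athena_client, 'SHOW DATABASES', None, False)

# Blocking call: run inside a specific database and poll the execution
# status (up to nine checks, 30 seconds apart) before returning.
execution_id = run_query(run_id, athena_client,
                         'SELECT COUNT(*) FROM sample_inventory_table',
                         'sample_inventory_db', True)
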
Example #3
def simulate(input_parameters):
    """ Simulate smoke test

    Arguments:
        input_parameters {config.S3InsightsInput} -- Input parameters for the current execution

    Raises:
        utility.S3InsightsException: If none of the source buckets can generate inventory reports
    """
    logging.info('simulating smoke test in the current environment')
    account_id = None
    region = None
    bucket_name = None
    account_ids = [
        account_config.id for account_config in input_parameters.accounts
    ]

    for account_id in account_ids:
        source_buckets_details = ddb.get_source_buckets_details(account_id)
        for ddb_bucket in source_buckets_details:
            inventory_status = ddb_bucket[ddb.TableFieldName.inventory_status]
            if utility.compare_strings(inventory_status,
                                       ddb.BucketInventoryStatus.in_progress):
                account_id = ddb_bucket[ddb.TableFieldName.account_id]
                region = ddb_bucket[ddb.TableFieldName.region]
                bucket_name = ddb_bucket[ddb.TableFieldName.sortkey]
                break
        if bucket_name is not None:
            break

    if bucket_name is None:
        raise utility.S3InsightsException(
            'could not find a bucket for smoke test')

    s3_client = awshelper.get_client(awshelper.ServiceName.s3)
    file_path = utility.get_file_path(
        __file__, "smoketestdata/sample_inventory_object.orc")

    s3_key = "{0}/{1}/{2}/inventorysmoketest/data/smoke_test_inventory_object.orc".format(
        account_id, region, bucket_name)
    destination_bucket_name = s3.get_destination_bucket_name(
        input_parameters.run_id, region)

    logging.info(
        f'smoke test destination_bucket_name:{destination_bucket_name} s3_key:{s3_key}'
    )
    response = s3_client.upload_file(file_path, destination_bucket_name,
                                     s3_key)
    logging.info(f'uploading a sample inventory object. response:{response}')
    s3_key = "{0}/{1}/{2}/inventorysmoketest/somedate/manifest.checksum".format(
        account_id, region, bucket_name)
    logging.info(response)
    sleep_time_in_seconds = config.ServiceParameters.smoke_test_sleep_time_in_seconds
    time.sleep(sleep_time_in_seconds)
    response = s3_client.upload_file(file_path, destination_bucket_name,
                                     s3_key)
    logging.info(f'uploading a sample manifest checksum. response:{response}')
    time.sleep(sleep_time_in_seconds)
Example #4
def process_inventory_object(event):
    """ Process an inventory object once it has been stored in a staging destination bucket

    Arguments:
        event {json} -- S3 notification event
    """
    if 'Records' in event:
        input_parameters = ddb.get_input_parameters()
        for record in event['Records']:
            if 'body' in record:
                body_json = json.loads(record['body'])
                if 'Records' in body_json:
                    for record in body_json['Records']:
                        if 'eventName' in record and record['eventName'] == 'ObjectCreated:Put':
                            source_bucket_name = record['s3']['bucket']['name']
                            if is_destination_bucket_name(input_parameters, source_bucket_name):
                                source_object_key = record['s3']['object']['key']
                                object_key_parts = source_object_key.split('/')
                                object_key_parts_len = len(object_key_parts)

                                bucket_account_id = object_key_parts[0].lower()
                                bucket_region = object_key_parts[1].lower()
                                bucket_name = object_key_parts[2].lower()
                                logging.info(f'source_object_key:{source_object_key} bucket_account_id:{bucket_account_id} bucket_name:{bucket_name}')
                                if ddb.is_inprogress_inventory_job(bucket_account_id, bucket_name):
                                    if object_key_parts_len > 4:
                                        if utility.compare_strings(object_key_parts[object_key_parts_len - 1], 'manifest.checksum'):
                                            ddb.update_source_bucket_inventory_status(object_key_parts[2], ddb.BucketInventoryStatus.done)
                                            remove_bucket_inventory_configuration(
                                                input_parameters.run_id,
                                                bucket_account_id,
                                                bucket_region, bucket_name
                                            )
                                        elif utility.compare_strings(object_key_parts[object_key_parts_len - 2], 'data'):
                                            copy_inventory_object_into_consolidation_bucket(
                                                input_parameters.run_id,
                                                source_bucket_name,
                                                source_object_key,
                                                config.DeploymentDetails.consolidated_inventory_bucket_name
                                            )
                                else:
                                    logging.warning('Received an unexpected SQS notification')
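
process_inventory_object expects the SQS-wrapped form of an S3 ObjectCreated:Put notification. A minimal entry point that forwards the raw Lambda event might look like the sketch below; the handler name is an assumption, not taken from the project.

def lambda_handler(event, context):
    # 'event' is the SQS batch; each record body carries the original S3
    # notification JSON that process_inventory_object unwraps and routes.
    process_inventory_object(event)
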
Example #5
    def is_complete(self, in_progress_state_name, complete_state_name):
        """ Check if the CloudFormation stack operation has finished

        Arguments:
            in_progress_state_name {string} -- In-progress state name
            complete_state_name {string} -- Complete state name

        Raises:
            utility.S3InsightsException: When the stack status is neither in-progress nor complete

        Returns:
            boolean -- Flag indicating if the operation has finished successfully
        """
        details = self.client.describe_stacks(StackName=self.name)
        logging.info(f'current stack details: {self.name} {details}')
        current_status = details['Stacks'][0]['StackStatus'].lower()
        if utility.compare_strings(current_status, in_progress_state_name):
            return False
        elif utility.compare_strings(current_status, complete_state_name):
            return True
        else:
            error_message = f'unknown stack status. Stack name:{self.name} current status:{current_status} expected status:{in_progress_state_name}/{complete_state_name} details:{details}'
            raise utility.S3InsightsException(error_message)
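
A polling sketch for the method above, assuming the caller holds the stack wrapper instance (stack) and passes CloudFormation state names such as 'create_in_progress' and 'create_complete'; the 30-second interval and attempt limit are assumptions.

import time

def wait_for_stack(stack, in_progress_state_name, complete_state_name, max_attempts=60):
    # is_complete raises S3InsightsException on unexpected stack states,
    # so only the in-progress case needs to be polled here.
    for _ in range(max_attempts):
        if stack.is_complete(in_progress_state_name, complete_state_name):
            return True
        time.sleep(30)
    return False
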
Example #6
def is_inprogress_inventory_job(account_id, bucket_name):
    """ Check if the inventory job is in progress for a given source bucket

    Arguments:
        account_id {string} -- AWS account id
        bucket_name {string} -- Source bucket name

    Returns:
        boolean -- Flag indicating if the job is in progress
    """
    result = False
    table = get_table()
    response = table.query(KeyConditionExpression=Key(
        TableFieldName.partitionkey).eq(TableValueCategory.source_bucket)
                           & Key(TableFieldName.sortkey).eq(bucket_name))
    if 'Items' in response and len(response['Items']) == 1:
        item = response['Items'][0]
        inventory_status = item[TableFieldName.inventory_status]
        if utility.compare_strings(inventory_status,
                                   BucketInventoryStatus.in_progress):
            result = True
        elif utility.compare_strings(inventory_status,
                                     BucketInventoryStatus.done):
            if TableFieldName.change_timestamp in item:
                utc_epoch_time_decimal = item[TableFieldName.change_timestamp]
                utc_epoch_time = float(utc_epoch_time_decimal)
                inventory_completion_timestamp = datetime.fromtimestamp(
                    utc_epoch_time)
                current_timestamp = datetime.utcnow()
                delta = current_timestamp - inventory_completion_timestamp
                delta_seconds = delta.total_seconds()
                if delta_seconds < (
                        config.ServiceParameters.
                        buffer_time_after_nventory_completion_in_hours * 3600):
                    # This must be an out of order notification. Let's copy this inventory file
                    result = True
    return result
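
A small usage sketch; the account id and bucket name are placeholders:

if is_inprogress_inventory_job('123456789012', 'example-source-bucket'):
    # Inventory objects for this bucket still belong to the current run,
    # including slightly out-of-order notifications after completion.
    logging.info('inventory job is still active for example-source-bucket')
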
Example #7
def update_source_bucket_inventory_status(input_parameters, account_id):
    """ Update inventory status for all applicable source buckets under the given AWS account

    Arguments:
        input_parameters {config.S3InsightsInput} -- Input parameters for the current execution
        account_id {string} -- AWS account id
    """
    source_buckets = get_source_buckets(input_parameters, account_id)
    source_buckets_details = ddb.get_source_buckets_details(account_id)
    s3_resources = {}
    for ddb_bucket in source_buckets_details:
        inventory_status = ddb_bucket[ddb.TableFieldName.inventory_status]
        if utility.compare_strings(inventory_status, ddb.BucketInventoryStatus.in_progress):
            ddb_account_id = ddb_bucket[ddb.TableFieldName.account_id]
            ddb_region = ddb_bucket[ddb.TableFieldName.region]
            ddb_bucket_name = ddb_bucket[ddb.TableFieldName.sortkey]

            if not (ddb_account_id in source_buckets
                    and ddb_region in source_buckets[ddb_account_id]
                    and ddb_bucket_name in source_buckets[ddb_account_id][ddb_region]):
                logging.info(f'{ddb_bucket_name} not available anymore. updating the ddb entry')
                ddb.update_source_bucket_inventory_status(
                    ddb_bucket_name,
                    ddb.BucketInventoryStatus.bucket_not_available)
            else:
                if ddb_account_id not in s3_resources:
                    s3_resources[ddb_account_id] = awshelper.get_resource(awshelper.ServiceName.s3, ddb_account_id, input_parameters.run_id)
                s3_resource = s3_resources[ddb_account_id]
                is_empty, client_error = is_bucket_empty(s3_resource, ddb_bucket_name)
                remove_inventory_configuration = False
                if client_error is not None:
                    logging.info(f'{ddb_bucket_name} bucket access lost for some reason. updating the ddb entry')
                    ddb.update_source_bucket_inventory_status(
                        ddb_bucket[ddb.TableFieldName.sortkey],
                        ddb.BucketInventoryStatus.bucket_access_lost)
                    remove_inventory_configuration = True
                elif is_empty:
                    logging.info(f'{ddb_bucket_name} is empty. Updating the DDB entry')
                    ddb.update_source_bucket_inventory_status(
                        ddb_bucket[ddb.TableFieldName.sortkey],
                        ddb.BucketInventoryStatus.bucket_is_empty)
                    remove_inventory_configuration = True

                if remove_inventory_configuration:
                    remove_bucket_inventory_configuration(
                        input_parameters.run_id,
                        ddb_account_id,
                        ddb_region,
                        ddb_bucket_name)
Example #8
def is_destination_bucket_name(input_parameters, bucket_name):
    """ Check if the given S3 bucket is a destination bucket

    Arguments:
        input_parameters {config.S3InsightsInput} -- Input parameters for the current execution
        bucket_name {string} -- Bucket name

    Returns:
        boolean -- Flag indicating if the bucket is a destination bucket
    """
    for region in input_parameters.supported_regions:
        destination_bucket_name = get_destination_bucket_name(input_parameters.run_id, region)
        if utility.compare_strings(destination_bucket_name, bucket_name):
            return True
    return False
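
For example, filtering a list of bucket names down to the pipeline's own destination buckets (the names below are illustrative):

candidate_buckets = ['example-application-logs', 'example-destination-bucket']
destination_buckets = [name for name in candidate_buckets
                       if is_destination_bucket_name(input_parameters, name)]
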
Example #9
def get_source_buckets(input_parameters, account_id):
    """ Get all eligible source buckets

    Arguments:
        input_parameters {config.S3InsightsInput} -- Input parameters for the current execution
        account_id {string} -- AWS account id

    Returns:
        dict<string, dict<string, list(string)>> -- Source buckets
    """
    source_buckets = {}
    account_id = account_id.lower()
    account_config = next(account_config for account_config in input_parameters.accounts if utility.compare_strings(account_config.id, account_id))
    source_buckets[account_id] = {}
    s3_client = awshelper.get_client(
        awshelper.ServiceName.s3,
        None,
        account_id,
        input_parameters.run_id)

    # Exclude the consolidation and inventory destination buckets
    pipeline_buckets = []
    if utility.compare_strings(account_id, awshelper.SessionManager.get_host_account_id()):
        pipeline_buckets.append(config.DeploymentDetails.consolidated_inventory_bucket_name)
        for region in input_parameters.supported_regions:
            bucket_name = get_destination_bucket_name(input_parameters.run_id, region)
            pipeline_buckets.append(bucket_name)
    response = s3_client.list_buckets()
    for bucket in response["Buckets"]:
        name = bucket["Name"].lower()
        if name not in account_config.exclude and name not in pipeline_buckets:
            try:
                location = s3_client.get_bucket_location(
                    Bucket=name)
                region = location['LocationConstraint']
                if region is None:
                    region = 'us-east-1'
                region = region.lower()
                if region in input_parameters.supported_regions:
                    if region not in source_buckets[account_id]:
                        source_buckets[account_id][region] = []
                    source_buckets[account_id][region].append(name)
            except ClientError as e:
                logging.error(f'error while retrieving bucket information for {account_id}:{name}. error details: {e}')

    return source_buckets
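
The return value is a nested mapping of account id to region to bucket names. A traversal sketch, assuming a placeholder account id:

source_buckets = get_source_buckets(input_parameters, '123456789012')
for account_id, regions in source_buckets.items():
    for region, bucket_names in regions.items():
        for bucket_name in bucket_names:
            logging.info(f'{account_id}/{region}/{bucket_name}')
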
Example #10
def copy_inventory_object_into_consolidation_bucket(run_id, source_bucket_name, source_object_key, destination_bucket_name):
    """ Copy inventory object into the consolidation bucket

    Arguments:
        run_id {string} -- run_id for the current Step Function execution
        source_bucket_name {string} -- Source bucket name
        source_object_key {string} -- Source object key
        destination_bucket_name {string} -- Destination bucket name
    """
    session = awshelper.SessionManager.get_host_aws_session()
    s3_resource = session.resource('s3', config=botocore_config(s3={'use_accelerate_endpoint': True}))
    object_key_parts = source_object_key.split('/')
    object_key_parts_len = len(object_key_parts)
    new_object_key = "{0}account={1}/region={2}/bucketname={3}/{4}".format(
        get_inventory_prefix_at_consolidated_bucket(run_id),
        object_key_parts[0],
        object_key_parts[1],
        object_key_parts[2],
        object_key_parts[object_key_parts_len - 1])

    copy_source = {
        'Bucket': source_bucket_name,
        'Key': source_object_key
    }

    try:
        s3_resource.meta.client.copy(
            copy_source,
            destination_bucket_name,
            new_object_key)
    except ClientError as e:
        if 'Error' in e.response and 'Code' in e.response['Error'] and utility.compare_strings(e.response['Error']['Code'], "slowdown"):
            wait_in_seconds = random.randint(1, 120)

            # S3 is throttling upload operations. Let's back off for a few seconds
            logging.warning(f's3 is throttling copy request for {source_bucket_name}:{source_object_key}. wait time in seconds:{wait_in_seconds}')
            time.sleep(wait_in_seconds)
        raise
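
Because the function re-raises after backing off on throttling, callers are expected to retry. A hedged retry sketch; the attempt limit is an assumption rather than a project setting:

from botocore.exceptions import ClientError

def copy_with_retries(run_id, source_bucket_name, source_object_key,
                      destination_bucket_name, max_attempts=3):
    for attempt in range(max_attempts):
        try:
            copy_inventory_object_into_consolidation_bucket(
                run_id, source_bucket_name, source_object_key,
                destination_bucket_name)
            return
        except ClientError:
            # The callee already sleeps on SlowDown errors before
            # re-raising; give up only once the attempts are exhausted.
            if attempt == max_attempts - 1:
                raise
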
Example #11
def send_welcome_email():
    """ Send welcome email
    """
    queries = ddb.get_athena_queries()
    query_details_html = '''
    <html>
        <head>
            <style>
                table, th, td {
                    border: 1px solid black;
                    border-collapse: collapse;
                }
                .success {
                    background-color: rgba(0, 255, 0, 0.2);
                }
                .failed {
                    background-color: rgba(255, 0, 0, 0.2);
                }
                .neutral {
                    background-color:white;
                }
            </style>
        </head>
        <body>
        <p>
            Your latest <a href="https://github.com/kurmiashish/S3Insights/blob/master/docs/user_guide.md#how-to-initiate-a-state-machine-execution">S3Insights Harvester execution</a> generated this welcome email. You can learn more about the platform <a href="https://github.com/kurmiashish/S3Insights">here</a>.
        </p>
    '''
    intro_html = 'In this run, the following Athena queries were executed. You can run additional Athena queries manually by following <a href="https://github.com/kurmiashish/S3Insights/blob/master/docs/user_guide.md#running-athena-analysis-queries-manually">these instructions</a>. Please refer to the <a href="https://github.com/kurmiashish/S3Insights/blob/master/docs/troubleshooting.md#athena-failures">Athena troubleshooting document</a> if any of the following Athena queries have failed.'
    input_parameters = ddb.get_input_parameters()
    if input_parameters.is_smoke_test:
        intro_html = intro_html + ' <b>As this is a smoke test run, the following links may not work as the platform may have deleted the Athena resources.</b>'
    query_details_html = query_details_html + "<h4>Analysis Queries</h4><p>" + intro_html + "</p>"

    query_details_html = query_details_html + '''
    <table>
            <tr>
                <th>Name</th>
                <th>Query</th>
                <th>Status</th>
                <th>Execution Details</th>
            </tr>
    '''
    succeeded_status_value = 'succeeded'
    done_status_value = 'done'
    bucket_is_empty_status_value = 'bucket_is_empty'
    everything_else_status_value = 'everything_else'

    success_css_class_name = 'success'
    failed_css_class_name = 'failed'
    neutral_css_class_name = 'neutral'

    css_mappings = {
        succeeded_status_value: success_css_class_name,
        done_status_value: success_css_class_name,
        everything_else_status_value: failed_css_class_name,
        bucket_is_empty_status_value: neutral_css_class_name
    }

    for state in [succeeded_status_value, everything_else_status_value]:
        for query in queries:
            should_include = False
            if not utility.compare_strings(state,
                                           everything_else_status_value):
                should_include = utility.compare_strings(state, query.state)
            else:
                should_include = not utility.compare_strings(
                    succeeded_status_value, query.state)

            if should_include:
                css_class_name = css_mappings[state]
                query_web_console_link = 'https://console.aws.amazon.com/athena/home?region={0}#query/history/{1}'.format(
                    config.DeploymentDetails.region, query.query_execution_id)
                query_web_console_link_html = '<a href={0}> Web Console Link </a>'.format(
                    query_web_console_link)
                query_details_html = query_details_html + f'<tr class="{css_class_name}"><td>' + ' </td><td>'.join(
                    [
                        query.query_name, query.actual_query, query.state,
                        query_web_console_link_html
                    ]) + '</td></tr>'

    query_details_html = query_details_html + '</table><br>'
    bucket_html_table = '''
            <h4>Source buckets</h4>
            <p>
                The following buckets are included in the analysis. If the platform failed to generate inventory for any of the buckets (i.e., if any entry in the following table is highlighted in Red), please consult the <a href="https://github.com/kurmiashish/S3Insights/blob/master/docs/troubleshooting.md#inventory-generation-failures">inventory generation troubleshooting document</a>.
            </p>
            <table>
            <tr>
                <th>Account</th>
                <th>Region</th>
                <th>Bucket</th>
                <th>Inventory Status</th>
            </tr>
    '''

    source_buckets = ddb.get_source_buckets()
    for account_id in source_buckets:
        # Let's calculate the value for rowspan
        account_row_span = sum([
            len(source_buckets[account_id][region])
            for region in source_buckets[account_id]
        ])
        inserted_account_row = False
        for region in source_buckets[account_id]:
            region_row_span = len(source_buckets[account_id][region])
            inserted_region_row = False
            for inventory_status in [
                    done_status_value, bucket_is_empty_status_value,
                    everything_else_status_value
            ]:
                for bucket in source_buckets[account_id][region]:
                    should_include = False
                    if not utility.compare_strings(
                            inventory_status, everything_else_status_value):
                        should_include = utility.compare_strings(
                            inventory_status, bucket.inventory_status)
                    else:
                        already_included = utility.compare_strings(
                            done_status_value, bucket.inventory_status
                        ) or utility.compare_strings(
                            bucket_is_empty_status_value,
                            bucket.inventory_status)
                        should_include = not already_included

                    if should_include:
                        css_class_name = css_mappings[inventory_status]
                        row = "<tr>"
                        if not inserted_account_row:
                            inserted_account_row = True
                            row = row + "<td rowspan={0}>{1}</td>".format(
                                account_row_span, account_id)
                        if not inserted_region_row:
                            inserted_region_row = True
                            row = row + "<td rowspan={0}>{1}</td>".format(
                                region_row_span, region)
                        row = row + f'<td class="{css_class_name}">{bucket.name}</td>'
                        row = row + f'<td class="{css_class_name}">{bucket.inventory_status}</td></tr>'
                        bucket_html_table = bucket_html_table + row
    bucket_html_table = bucket_html_table + "</table>"
    query_details_html = query_details_html + bucket_html_table

    input_parameters_str = json.dumps(
        input_parameters,
        default=lambda input_parameters: input_parameters.__dict__,
        sort_keys=True,
        indent=4,
        separators=(',', ': '))

    input_parameters_section = '''
<br>
<h4>Input Parameters</h4>
<p>
<div style="white-space: pre-wrap;">
The execution parameters used for this run are given below.
{0}
</div>
</p>
    '''.format(input_parameters_str)
    query_details_html = query_details_html + input_parameters_section + '</body></html>'
    logging.info(f'welcome email content:{query_details_html}')

    input_parameters = ddb.get_input_parameters()
    ses_client = awshelper.get_client(awshelper.ServiceName.ses)
    response = ses_client.send_email(
        Destination={
            'ToAddresses': input_parameters.recipient_email_addresses,
        },
        Message={
            'Body': {
                'Html': {
                    'Charset': 'UTF-8',
                    'Data': query_details_html,
                },
                'Text': {
                    'Charset': 'UTF-8',
                    'Data': query_details_html,
                },
            },
            'Subject': {
                'Charset': 'UTF-8',
                'Data': 'Your S3Insights snapshot is ready',
            },
        },
        Source=input_parameters.sender_email_address,
    )
    logging.info(f'send email api response:{response}')