Example 1
    def unique_s3_buckets_and_keys(self):
        """Filter a list of unique S3 buckets and S3 keys from event notifications

        Reads the messages previously fetched into ``self.received_messages``
        and collects the S3 object keys they reference, grouped by bucket name.
        Each message that contributes at least one key is appended exactly once
        to ``self.processed_messages`` so it can be removed from the queue.

        Returns:
            (dict): Keys of bucket names, and values of unique S3 keys, or
                None if no messages have been fetched yet
        """
        s3_buckets_and_keys = defaultdict(set)

        if not self.received_messages:
            LOGGER.error(
                'No messages to filter, fetch the messages with get_messages()'
            )
            return

        for message in self.received_messages:
            if 'Body' not in message:
                LOGGER.error('Missing \'Body\' key in SQS message, skipping')
                continue

            loaded_message = json.loads(message['Body'])

            # From AWS documentation: http://amzn.to/2w4fcSq
            # When you configure an event notification on a bucket,
            # Amazon S3 sends the following test message:
            # {
            #    "Service":"Amazon S3",
            #    "Event":"s3:TestEvent",
            #    "Time":"2014-10-13T15:57:02.089Z",
            #    "Bucket":"bucketname",
            #    "RequestId":"5582815E1AEA5ADF",
            #    "HostId":"8cLeGAmw098X5cv4Zkwcmo8vvZa3eH3eKxsPzbB9wrR+YstdA6Knx4Ip8EXAMPLE"
            # }
            if loaded_message.get('Event') == 's3:TestEvent':
                LOGGER.debug('Skipping S3 bucket notification test event')
                continue

            if 'Records' not in loaded_message:
                LOGGER.error(
                    'Missing \'Records\' key in SQS message, skipping:\n%s',
                    json.dumps(loaded_message, indent=4))
                continue

            message_contributed = False
            for record in loaded_message['Records']:
                if 's3' not in record:
                    LOGGER.info('Skipping non-s3 bucket notification message')
                    LOGGER.debug(record)
                    continue

                bucket_name = record['s3']['bucket']['name']
                # Account for special characters in the S3 object key
                # Example: Usage of '=' in the key name
                object_key = urllib.unquote(
                    record['s3']['object']['key']).decode('utf8')
                s3_buckets_and_keys[bucket_name].add(object_key)
                message_contributed = True

            # Track successfully processed messages once per *message*, not
            # once per record: appending inside the record loop would add a
            # message with N s3 records N times, causing duplicate deletes
            # when the queue is later cleaned up
            if message_contributed:
                self.processed_messages.append(message)

        return s3_buckets_and_keys
Example 2
    def run_athena_query(self, **kwargs):
        """Helper function to run Athena queries

        Keyword Args:
            query (str): The SQL query to execute
            database (str): The database context to execute the query in.
                Defaults to ``self.DATABASE_DEFAULT`` when omitted.
            async (bool): If the function should asynchronously run queries
                without backing off until completion.

        Returns:
            bool, dict: query success, query result response. An asynchronous
                invocation returns the start_query_execution response instead
                of the query results.
        """
        LOGGER.debug('Executing query: %s', kwargs['query'])
        query_execution_resp = self.athena_client.start_query_execution(
            QueryString=kwargs['query'],
            QueryExecutionContext={
                'Database': kwargs.get('database', self.DATABASE_DEFAULT)
            },
            ResultConfiguration={
                'OutputLocation':
                '{}/{}'.format(self.athena_results_bucket,
                               self.athena_results_key)
            })

        # If asynchronous invocation is enabled, and a valid query
        # execution ID was returned, do not back off until completion
        if kwargs.get('async') and query_execution_resp.get(
                'QueryExecutionId'):
            return True, query_execution_resp

        execution_id = query_execution_resp['QueryExecutionId']
        query_execution_result = self.check_query_status(execution_id)

        state = query_execution_result['QueryExecution']['Status']['State']

        if state != 'SUCCEEDED':
            # StateChangeReason is not guaranteed to be present in the
            # Status response (eg: cancelled queries), so avoid a KeyError
            # while reporting the failure
            reason = query_execution_result['QueryExecution']['Status'].get(
                'StateChangeReason', 'Unknown')
            LOGGER.error('Query %s %s with reason %s, exiting!', execution_id,
                         state, reason)
            LOGGER.error('Full query:\n%s', kwargs['query'])
            return False, {}

        query_results_resp = self.athena_client.get_query_results(
            QueryExecutionId=execution_id)

        # The idea here is to leave the processing logic to the calling functions.
        # No data being returned isn't always an indication that something is wrong.
        # When handling the query result data, iterate over each element in the Row,
        # and parse the Data key.
        # Reference: https://bit.ly/2tWOQ2N
        if not query_results_resp['ResultSet']['Rows']:
            LOGGER.debug('The query %s returned empty rows of data',
                         kwargs['query'])

        return True, query_results_resp
Example 3
def _giveup_handler(details):
    """Log a debug message when backoff exhausts its retries and gives up.

    Args:
        details (dict): Backoff context containing the number of tries,
            target function currently executing, kwargs, args, value,
            and wait time.
    """
    num_tries = details['tries']
    target_name = details['target'].__name__
    LOGGER.debug('[Backoff]: Exiting after %d tries calling %s', num_tries,
                 target_name)
Example 4
def _backoff_handler(details):
    """Log a debug message each time backoff schedules another retry.

    Args:
        details (dict): Backoff context containing the number of tries,
            target function currently executing, kwargs, args, value,
            and wait time.
    """
    wait_seconds = details['wait']
    num_tries = details['tries']
    target_name = details['target'].__name__
    LOGGER.debug(
        '[Backoff]: Trying again in %f seconds after %d tries calling %s',
        wait_seconds, num_tries, target_name)
Example 5
    def run(self, event):
        """Take the messages from the SQS queue and create partitions for new data in S3

        Args:
            event (dict): Lambda input event containing SQS messages. Each SQS message
                should contain one (or maybe more) S3 bucket notification message.

        Raises:
            AthenaRefreshError: If the configured database does not exist,
                or if adding any of the collected partitions fails.
        """
        # Check that the database being used exists before running queries
        if not self._athena_client.check_database_exists():
            raise AthenaRefreshError(
                'The \'{}\' database does not exist'.format(
                    self._athena_client.database))

        for sqs_rec in event['Records']:
            LOGGER.debug(
                'Processing event with message ID \'%s\' and SentTimestamp %s',
                sqs_rec['messageId'], sqs_rec['attributes']['SentTimestamp'])

            body = json.loads(sqs_rec['body'])
            if body.get('Event') == 's3:TestEvent':
                LOGGER.debug('Skipping S3 bucket notification test event')
                continue

            # Guard against notification bodies with no 'Records' key instead
            # of raising a KeyError; mirrors the handling used when filtering
            # unique buckets/keys elsewhere in this codebase
            if 'Records' not in body:
                LOGGER.error(
                    'Missing \'Records\' key in SQS message, skipping:\n%s',
                    json.dumps(body, indent=4))
                continue

            for s3_rec in body['Records']:
                if 's3' not in s3_rec:
                    LOGGER.info(
                        'Skipping non-s3 bucket notification message: %s',
                        s3_rec)
                    continue

                bucket_name = s3_rec['s3']['bucket']['name']

                # Account for special characters in the S3 object key
                # Example: Usage of '=' in the key name
                object_key = urllib.unquote_plus(
                    s3_rec['s3']['object']['key']).decode('utf8')

                LOGGER.debug(
                    'Received notification for object \'%s\' in bucket \'%s\'',
                    object_key, bucket_name)

                self._s3_buckets_and_keys[bucket_name].add(object_key)

        if not self._add_partitions():
            raise AthenaRefreshError('Failed to add partitions: {}'.format(
                dict(self._s3_buckets_and_keys)))