Python S3Configuration Examples, datalake_library.configuration.resource_configs.S3Configuration Python Examples

Example #1

0

Show file

File: lambda_function.py Project: emarenas/aws-serverless-data-lake-framework

def lambda_handler(event, context):
    """Starts one state machine execution for every message in the event

    Arguments:
        event {dict} -- Dictionary whose 'Records' entries each carry a JSON 'body'
        context {dict} -- Dictionary with details on Lambda context (not used here)
    """

    # TODO Implement Redrive Logic (through message_group_id)
    try:
        logger.info('Received recent messages: {} '.format(event))
        for sqs_record in event['Records']:
            payload = json.loads(sqs_record['body'])

            # Mandatory fields: a missing key raises KeyError and aborts the batch.
            # org/app/env are read but not used any further in this handler.
            team = payload['team']
            pipeline = payload['pipeline']
            stage = payload['pipeline_stage']
            dataset = payload['dataset']
            org = payload['org']
            app = payload['app']
            env = payload['env']
            dest_table = payload['dest_table']['name']

            # Enrich the message in place; it becomes the execution input body
            payload['bucket'] = S3Configuration().stage_bucket
            payload['keysToProcess'] = payload['prev_stage_processed_keys']
            response = {'statusCode': 200, 'body': payload}

            # Execution name: dataset, destination table and the current epoch
            # time rendered with three decimals
            execution_name = '{}-{}-{:.3f}'.format(
                dataset, dest_table, time.time())

            logger.info('Starting State Machine Execution')
            state_config = StateMachineConfiguration(team, pipeline, stage)
            StatesInterface().run_state_machine(
                state_config.get_stage_state_machine_arn,
                response,
                execution_name)
    except Exception as exc:
        # # If failure send to DLQ
        # if keys_to_process:
        #     dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name)
        #     dlq_interface.send_message_to_fifo_queue(
        #         json.dumps(response), 'failed')
        logger.error("Fatal error", exc_info=True)
        raise exc
    return

Example #2

0

Show file

def lambda_handler(event, context):
    """Checks the priority queues for waiting items and triggers the state machine

    Running executions of the stage state machine are listed first. Only the
    free slots (max_sfn_executions minus the running ones) are filled, reading
    the "HIGH" priority queue before the "LOW" one. One execution is started
    per received message.

    Arguments:
        event {dict} -- Dictionary with team, pipeline, pipeline_stage,
            dataset, org, app and env entries
        context {dict} -- Dictionary with details on Lambda context
    """
    # Bound before the try block: the except handler reads both names, and an
    # early failure (e.g. a missing event key) would otherwise raise a
    # NameError that hides the real error.
    keys_to_process = []
    sqs_config = None

    # TODO Implement Redrive Logic (through message_group_id)
    try:
        team = event['team']
        pipeline = event['pipeline']
        stage = event['pipeline_stage']
        dataset = event['dataset']
        org = event['org']
        app = event['app']
        env = event['env']
        stage_bucket = S3Configuration().stage_bucket

        sqs_config = SQSConfiguration(team, dataset, stage)

        # Workload management changes
        # ---------------------------
        max_sfn_executions = 3

        state_config = StateMachineConfiguration(team, pipeline, stage)
        STEPFUNCTION_ARN = state_config.get_stage_state_machine_arn

        # SFN processing code
        # ---------------------------------
        executions = StatesInterface().list_state_executions(
            STEPFUNCTION_ARN, 'RUNNING', 50)

        current_sfn_exeuction = len(executions)
        logger.info(f"current_sfn_exeuctions:{current_sfn_exeuction}")

        if current_sfn_exeuction >= max_sfn_executions:
            logger.info("No step function slot empty ----- exiting")
            return
        sfn_available_slots = max_sfn_executions - current_sfn_exeuction
        logger.info(f"sfn_available_slots:{sfn_available_slots}")
        # -----------------------------------

        # Dynamically managing workload from different priority queues
        # ------------------------------------
        for priority in ["HIGH", "LOW"]:  # high=10 #low=30 sfn slots=40
            logger.info(f"priority:{priority}")
            sqs_config = SQSConfiguration(team, dataset, stage, priority)
            queue_interface = SQSInterface(sqs_config.get_stage_queue_name_wlm)

            message_queue = queue_interface._message_queue
            num_messages_queue = int(
                message_queue.attributes['ApproximateNumberOfMessages'])
            logger.info(
                f"Number of messages in {message_queue}:{num_messages_queue}")

            if num_messages_queue == 0:
                logger.info(
                    f"Not enough messages in {message_queue}, trying next prority queue"
                )
            else:
                # Never request more messages than there are free slots
                # Example  40>20 12>8 for high priority sqs
                num_messages_queue = min(num_messages_queue,
                                         sfn_available_slots)

                keys_to_process = queue_interface.wlm_receive_max_messages(
                    keys_to_process, num_messages_queue)
                logger.info(f"messages:{keys_to_process}")
                sfn_available_slots = sfn_available_slots - num_messages_queue
                logger.info(f"sfn_available_slots:{sfn_available_slots}")
            if sfn_available_slots == 0:
                break
        # -------------------------------------

        # Running step function for processed messages
        # --------------------------------------
        if keys_to_process:
            for key in keys_to_process:
                response = {
                    'statusCode': 200,
                    'body': {
                        "bucket": stage_bucket,
                        "keysToProcess": [key],
                        "team": team,
                        "pipeline": pipeline,
                        "pipeline_stage": stage,
                        "dataset": dataset,
                        "org": org,
                        "app": app,
                        "env": env
                    }
                }
                StatesInterface().run_state_machine(
                    state_config.get_stage_state_machine_arn, response)
            logger.info(
                f"{len(keys_to_process)} messages sent to step function ")
        else:
            logger.info("Not enough messages in any queue --- exiting")
        # ----------------------------------------

    except Exception as e:
        # Log first so the original failure is recorded even if the DLQ
        # hand-off below fails as well.
        logger.error("Fatal error", exc_info=True)
        # If failure send to DLQ
        if keys_to_process and sqs_config is not None:
            dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name)
            for key in keys_to_process:
                dlq_interface.send_message_to_fifo_queue(
                    json.dumps(key), 'failed')
        raise e
    return

Example #3

0

Show file

File: light_transform_athena_ctas.py Project: emarenas/aws-serverless-data-lake-framework

import sys
import logging
import traceback
import string
import random
#######################################################
# Use S3 Interface to interact with S3 objects
# For example to download/upload them
#######################################################
from datalake_library.commons import init_logger
from datalake_library.configuration.resource_configs import S3Configuration, KMSConfiguration
from datalake_library.interfaces.s3_interface import S3Interface

# Module-level objects, created once when the module is imported.
s3_interface = S3Interface()
# IMPORTANT: Stage bucket where transformed data must be uploaded
stage_bucket = S3Configuration().stage_bucket
# NOTE(review): boto3 is not among the imports visible in this excerpt --
# confirm it is imported elsewhere in the file.
athena_client = boto3.client('athena')
glue_client = boto3.client('glue')
# Root logger, with DEBUG and above enabled
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


class CustomTransform():
    def __init__(self):
        logger.info("Athena Light Transform initiated")

    def transform_object(self, bucket, body, team, dataset):

        # returns table path, or table path with partition name
        # example if table has no partition
        # full_table_path = pre-stage/team/dataset/TABLE_NAME

Example #4

0

Show file

File: lambda_function.py Project: joelee/aws-serverless-data-lake-framework

def lambda_handler(event, context):
    """Updates the S3 objects metadata catalog

    Stores one metadata record per processed key, looks up the workload
    management priority of the dataset and forwards the processed keys to the
    queue of the next pipeline stage.

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {int} -- 200 when the step completed

    Raises:
        Exception -- any failure is logged, reported to Octagon (when the
            client was already built) and re-raised
    """
    # Bound before the try block: the except handler reads these names, and an
    # early failure (e.g. a missing event key) would otherwise raise a
    # NameError that hides the real error.
    octagon_client = None
    component = None
    stage = None
    try:
        logger.info('Fetching event data from previous step')
        body = event['body']
        processed_keys = body['processedKeys']
        team = body['team']
        pipeline = body['pipeline']
        stage = body['pipeline_stage']
        dataset = body['dataset']
        peh_id = body['peh_id']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(body['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        logger.info('Storing metadata to DynamoDB')
        bucket = S3Configuration().stage_bucket
        # One interface for all keys instead of two new ones per key
        s3_interface = S3Interface()
        for key in processed_keys:
            object_metadata = {
                'bucket': bucket,
                'key': key,
                'size': s3_interface.get_size(bucket, key),
                'last_modified_date':
                s3_interface.get_last_modified(bucket, key),
                'org': body['org'],
                'app': body['app'],
                'env': body['env'],
                'team': team,
                'pipeline': pipeline,
                'dataset': dataset,
                'stage': 'stage',
                'pipeline_stage': stage,
                'peh_id': peh_id
            }

            dynamo_interface.update_object_metadata_catalog(object_metadata)

        # Workload management changes
        # ---------------------------
        # The control-table item name ends with the second-to-last path
        # segment of the first processed key.
        wlm_ddb_table = dynamo_interface.wlm_control_table
        item = dynamo_interface.get_item(
            wlm_ddb_table, {
                "name":
                "{}-{}-{}".format(team, dataset,
                                  processed_keys[0].split("/")[-2])
            })
        priority = item.get('priority', None)
        logger.info('priority: {}'.format(priority))
        # ---------------------------

        logger.info('Sending messages to next SQS queue if it exists')
        # Next stage name: same prefix, last character advanced by one
        next_stage = ''.join([stage[:-1], chr(ord(stage[-1]) + 1)])
        sqs_config = SQSConfiguration(team, dataset, next_stage,
                                      priority)  # Workload management changes
        sqs_interface = SQSInterface(
            sqs_config.get_stage_queue_name_wlm)  # Workload management changes
        sqs_interface.send_batch_messages_to_fifo_queue(
            processed_keys, 10, '{}-{}'.format(team, dataset))

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
        octagon_client.end_pipeline_execution_success()
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # The client only exists once the event was parsed successfully
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise e
    return 200

Example #5

0

Show file

def lambda_handler(event, context):
    """Checks if any items need processing and triggers state machine

    The minimum and maximum number of items are read from the transform table
    item of the team/dataset. When the stage queue returns keys, they are
    de-duplicated and handed to the stage state machine in one execution.

    Arguments:
        event {dict} -- Dictionary with team, pipeline, pipeline_stage,
            dataset, org, app and env entries
        context {dict} -- Dictionary with details on Lambda context
    """
    # Bound before the try block: the except handler reads these names, and an
    # early failure (e.g. a missing event key) would otherwise raise a
    # NameError that hides the real error.
    keys_to_process = []
    sqs_config = None
    response = None

    # TODO Implement Redrive Logic (through message_group_id)
    try:
        team = event['team']
        pipeline = event['pipeline']
        stage = event['pipeline_stage']
        dataset = event['dataset']
        org = event['org']
        app = event['app']
        env = event['env']
        stage_bucket = S3Configuration().stage_bucket
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)
        transform_info = dynamo_interface.get_transform_table_item(
            '{}-{}'.format(team, dataset))
        # Limits are stored per stage under the key 'stage_<last char of stage>'
        stage_key = 'stage_{}'.format(stage[-1].lower())
        MIN_ITEMS_TO_PROCESS = int(
            transform_info['min_items_process'][stage_key])
        MAX_ITEMS_TO_PROCESS = int(
            transform_info['max_items_process'][stage_key])
        sqs_config = SQSConfiguration(team, dataset, stage)
        queue_interface = SQSInterface(sqs_config.get_stage_queue_name)

        logger.info('Querying {}-{} objects waiting for processing'.format(
            team, dataset))
        keys_to_process = queue_interface.receive_min_max_messages(
            MIN_ITEMS_TO_PROCESS, MAX_ITEMS_TO_PROCESS)
        # If no keys to process, break
        if not keys_to_process:
            return

        logger.info('{} Objects ready for processing'.format(
            len(keys_to_process)))
        keys_to_process = list(set(keys_to_process))

        response = {
            'statusCode': 200,
            'body': {
                "bucket": stage_bucket,
                "keysToProcess": keys_to_process,
                "team": team,
                "pipeline": pipeline,
                "pipeline_stage": stage,
                "dataset": dataset,
                "org": org,
                "app": app,
                "env": env
            }
        }
        logger.info('Starting State Machine Execution')
        state_config = StateMachineConfiguration(team, pipeline, stage)
        StatesInterface().run_state_machine(
            state_config.get_stage_state_machine_arn, response)
    except Exception as e:
        # Log first so the original failure is recorded even if the DLQ
        # hand-off below fails as well.
        logger.error("Fatal error", exc_info=True)
        # If failure send to DLQ (only possible once the payload was built)
        if keys_to_process and sqs_config is not None and response is not None:
            dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name)
            dlq_interface.send_message_to_fifo_queue(json.dumps(response),
                                                     'failed')
        raise e
    return

Example #6

0

Show file

def lambda_handler(event, context):
    """ Load Datafile metadata in manifests control table
        Check if manifest file is available within the threshold

    Manifest files under the team/dataset manifest prefix of the stage bucket
    are matched against the configured regex pattern. For every match the
    manifests control table is queried for the incoming data file. When an
    item is found its status is set to STARTED, otherwise the function sleeps
    for five minutes and reports the manifest as not loaded.

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- The incoming event, with 'manifest_interval',
            'manifest_file_loaded' and (when found) 'manifest_ddb_key'
            added to its body

    Raises:
        Exception -- "Manifest Threshold Breached" when the elapsed minutes
            since manifest_interval reach manifest_timeout; any other failure
            is logged, reported to Octagon (when the client was already
            built) and re-raised
    """
    s3_interface = S3Interface()
    stage_bucket = S3Configuration().stage_bucket

    dynamo_config = DynamoConfiguration()
    dynamo_interface = DynamoInterface(dynamo_config)
    current_time = dt.datetime.utcnow()
    current_timestamp = current_time.timestamp()

    # Bound before the try block: the except handler reads these names, and an
    # early failure (e.g. a missing event key) would otherwise raise a
    # NameError that hides the real error.
    octagon_client = None
    component = None
    stage = None

    try:
        logger.info("Fetching event data from previous step")
        body = event['body']
        team = body['team']
        pipeline = body['pipeline']
        stage = body['pipeline_stage']
        dataset = body['dataset']
        peh_id = body['peh_id']
        env = body['env']
        bucket = body['bucket']
        input_file_key = body['key']
        input_file_name = input_file_key.split("/")[-1]
        manifest_file_pattern = body['manifest_details']['regex_pattern']
        manifest_timeout = int(body['manifest_details']['manifest_timeout'])

        # First invocation: no interval recorded yet, start counting from now
        if 'manifest_interval' in body:
            manifest_interval = body['manifest_interval']
        else:
            manifest_interval = current_timestamp

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(env).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)

        ### List S3 Objects for the manifest file in the manifest prefix
        ### For this to work the manifest should have been loaded into DynamoDB

        manifest_key = "pre-stage/{}/manifests/{}/".format(team, dataset)
        processed_manifest_keys = s3_interface.list_objects(
            stage_bucket, manifest_key)

        matched_keys = []
        items = []
        manifest_file_loaded = "False"

        # Collect every manifest file name matching the pattern first ...
        for manifest_file_key in processed_manifest_keys or []:
            manifest_file_name = manifest_file_key.split("/")[-1]
            if re.match(manifest_file_pattern, manifest_file_name):
                matched_keys.append(manifest_file_name)

        ### Query Manifests Control table
        # ... then query the control table once per match. Doing this outside
        # the listing loop avoids repeated lookups and repeated five-minute
        # sleeps while iterating over the manifest keys.
        dataset_name = team + "-" + dataset
        for matched_name in matched_keys:
            try:
                items.append(
                    dynamo_interface.get_item_from_manifests_control_table(
                        dataset_name, matched_name, input_file_name))
            except KeyError:
                logger.info(
                    "Manifest File has not been loaded, sleeping for 5 mins"
                )

        ### Update Manifests Control table

        if not items:
            logger.info(
                "Manifest File has not been loaded, sleeping for 5 mins")
            time.sleep(300)
        else:
            ddb_key = {
                'dataset_name': items[0]['dataset_name'],
                'datafile_name': items[0]['datafile_name']
            }
            STATUS = "STARTED"
            dynamo_interface.update_manifests_control_table_stagea(
                ddb_key, STATUS)
            manifest_file_loaded = "True"
            event['body']['manifest_ddb_key'] = ddb_key

        ### Check if Manifest threshold has exceeded

        # On the first invocation both values are equal; refresh the current
        # time so the time spent above (including any sleep) is counted.
        if current_timestamp == manifest_interval:
            current_timestamp = dt.datetime.utcnow().timestamp()

        if int(
            (current_timestamp - manifest_interval) / 60) >= manifest_timeout:
            logger.error("Manifest Threshold Breached")
            raise Exception("Manifest Threshold Breached")

        event['body']['manifest_interval'] = manifest_interval
        event['body']['manifest_file_loaded'] = manifest_file_loaded

    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # The client only exists once the event was parsed successfully
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise e

    return event

Example #7

0

Show file

File: lambda_function.py Project: yihwen/aws-serverless-data-lake-framework

def lambda_handler(event, context):
    """ Process the manifest file and loads into DynamoDB

    The manifest file is downloaded, one control-table item is written per
    line (using the last path segment of the line as the file name) and the
    manifest is copied to the team/dataset manifest prefix of the stage
    bucket.

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {list} -- Single-element list holding the stage bucket key the
            manifest file was copied to

    Raises:
        Exception -- any failure is logged, reported to Octagon (when the
            client was already built) and re-raised
    """
    s3_interface = S3Interface()
    stage_bucket = S3Configuration().stage_bucket

    dynamo_config = DynamoConfiguration()
    dynamo_interface = DynamoInterface(dynamo_config)

    # Bound before the try block: the except handler reads these names, and an
    # early failure (e.g. a missing event key) would otherwise raise a
    # NameError that hides the real error.
    octagon_client = None
    component = None
    stage = None

    try:
        logger.info("Fetching event data from previous step")
        body = event['body']
        team = body['team']
        pipeline = body['pipeline']
        stage = body['pipeline_stage']
        dataset = body['dataset']
        peh_id = body['peh_id']
        env = body['env']
        bucket = body['bucket']
        manifest_file_key = body['key']
        manifest_file_name = manifest_file_key.split("/")[-1]

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(env).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        ### Download the manifest file to local

        local_path = s3_interface.download_object(bucket, manifest_file_key)

        ### Process the manifest file

        with open(local_path, "r") as raw_file:
            file_names = [
                line.strip().split("/")[-1] for line in raw_file
            ]

        ### Load data into manifests control table

        # Same for every data file listed in this manifest
        dataset_name = team + "-" + dataset + "-" + manifest_file_name
        for data_file_name in file_names:
            item = {
                "dataset_name": dataset_name,
                "datafile_name": manifest_file_name + "-" + data_file_name
            }
            dynamo_interface.put_item_in_manifests_control_table(item)

        ### Set s3 path for Copy
        s3_path = 'pre-stage/{}/manifests/{}/{}'.format(
            team, dataset, manifest_file_name)
        kms_key = KMSConfiguration(team).get_kms_arn

        ### Copy Manifest File to team/manifest/dataset location

        s3_interface.copy_object(bucket,
                                 manifest_file_key,
                                 stage_bucket,
                                 s3_path,
                                 kms_key=kms_key)

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)

        processed_keys = [s3_path]

    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # The client only exists once the event was parsed successfully
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise e

    return processed_keys

Example #8

0

Show file

def lambda_handler(event, context):
    """Updates the S3 objects metadata catalog

    Stores and tags one metadata record per processed key. When the job has
    at least one step, the dependency tables named in SSM are consulted and a
    message is sent to the next stage queue for every dataset listed for the
    pre-stage table.

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {int} -- 200 when the step completed

    Raises:
        Exception -- any failure is logged, reported to Octagon (when the
            client was already built) and re-raised
    """
    # Bound before the try block: the except handler reads these names, and an
    # early failure (e.g. a missing event key) would otherwise raise a
    # NameError that hides the real error.
    octagon_client = None
    component = None
    stage = None

    try:
        logger.info('Fetching event data from previous step')
        body = event['body']
        processOutput = body['job']['processOutput']
        processed_keys = processOutput['processed_keys']
        team = body['team']
        pipeline = body['pipeline']
        stage = body['pipeline_stage']
        dataset1 = body['dataset']
        peh_id = body['peh_id']
        prestage_db = processOutput.get('prestage_db', None)
        prestage_table = processOutput.get('prestage_table', None)
        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(body['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        logger.info(
            'Storing metadata to DynamoDB and tagging resulting S3 Objects')
        bucket = S3Configuration().stage_bucket
        for key in processed_keys:
            object_metadata = {
                'bucket': bucket,
                'key': key,
                'size': S3Interface().get_size(bucket, key),
                'last_modified_date':
                S3Interface().get_last_modified(bucket, key),
                'org': body['org'],
                'app': body['app'],
                'env': body['env'],
                'team': team,
                'pipeline': pipeline,
                'dataset': dataset1,
                'stage': 'stage',
                'pipeline_stage': stage,
                'peh_id': peh_id
            }

            dynamo_interface.update_object_metadata_catalog(object_metadata)

            # Tag the object with a subset of its metadata
            tag_keys = ['org', 'app', 'env', 'team', 'dataset']
            tag_dict = {
                tag_key: object_metadata[tag_key] for tag_key in tag_keys
            }
            S3Interface().tag_object(bucket, key, tag_dict)

        # New dependencies
        if body['job']['jobDetails']['num_of_steps'] > 0:
            # Both DynamoDB table names are stored as SSM parameters
            ssmresponse = ssmcli.get_parameter(
                Name=f'/SDLF/DDB/{team}/{pipeline}/DependenciesByTable')
            ddb_dependencies_by_table = ssmresponse['Parameter']['Value']
            ddb_table = dynamodb.Table(ddb_dependencies_by_table)
            ssmresponse = ssmcli.get_parameter(
                Name=f'/SDLF/DDB/{team}/{pipeline}/Dependencies')
            ddb_dependencies = ssmresponse['Parameter']['Value']
            # NOTE(review): prestage_db / prestage_table default to None above;
            # .lower() raises AttributeError if either is missing from
            # processOutput -- confirm they are always present on this path.
            consulta = f'{prestage_db.lower()}.{prestage_table.lower()}'
            logger.info(consulta)
            response = ddb_table.get_item(Key={'table_name': consulta})
            logger.info(f'Response {response}')
            if 'Item' in response:
                list_transforms = response['Item']['list_transforms']
                num_of_transforms = len(list_transforms)
                logger.debug(f'Response {response}')
                logger.info(
                    f'This table triggers {num_of_transforms} datasets')
                # Next stage name: same prefix, last character advanced by one
                next_stage = chr(ord(stage[-1]) + 1)
                # NOTE(review): stage_b_message is created once, outside the
                # dataset loop, and sent for every dataset below. A dataset
                # that never reaches the non-validate branch therefore sends
                # an empty or previously filled message -- confirm intended.
                stage_b_message = {}
                dest = {}
                tbls = []
                for dataset in list_transforms:
                    ddb_steps = dynamodb.Table(ddb_dependencies)
                    logger.info(dataset)
                    response = ddb_steps.get_item(Key={'dataset': dataset})
                    logger.info(f'Response {response}')
                    num_of_transforms = len(list_transforms)
                    item = response['Item']
                    # item['dataset'] is split on '.' into database and table
                    dest_table = item['dataset'].split('.')[1]
                    dest_db = item['dataset'].split('.')[0]
                    date_substitutions = replace_decimals(
                        item.get('date_substitutions', []))
                    dependencies = item['dependencies']
                    logger.info(f'Dependencies: {dependencies}')
                    partition = item.get('partitionColumn', '')
                    partition_mask = item.get('partitionPythonMask', None)
                    partition_value_formatted = None
                    for table in dependencies:
                        table_name = table['TableName'].split('.')[1]
                        table_db = table['TableName'].split('.')[0]
                        table_partition = table.get('FieldColumn', '')
                        table_partition_format = table.get(
                            'DateExpression', None)
                        relativedelta_attributes = replace_decimals(
                            table.get('relativedelta_attributes', None))
                        relativedelta_attributes = replace_days(
                            relativedelta_attributes)
                        logger.info(
                            f'relativedelta_attributes={relativedelta_attributes}'
                        )
                        table_partitions = processOutput.get('partitions', [])
                        usage = table.get('Usage', 'validate').lower()
                        if usage == 'validate':
                            if prestage_db.lower() == table_db.lower(
                            ) and prestage_table.lower() == table_name.lower():
                                logger.info(
                                    f'This table does not update/overwrite {dataset} dataset'
                                )
                                break
                            else:
                                logger.debug(
                                    f'Table {table_db}.{table_name} is not the trigger table'
                                )
                        elif prestage_db.lower() == table_db.lower(
                        ) and prestage_table.lower() == table_name.lower():
                            # dst_tbl_partitions = get_table_partitions(prestage_db,prestage_table)
                            partition_value_formatted = ''
                            # If dest table has partitions and source table has partitions
                            logger.debug(
                                f'Partition: {partition}, table_partitions: {table_partitions}'
                            )
                            if table_partitions and table_partition_format is not None:
                                # Shift the first source partition value by the
                                # configured relativedelta and format it with
                                # the destination partition mask
                                table_partition_value = table_partitions[0][
                                    'value']
                                value = datetime.strptime(
                                    table_partition_value,
                                    table_partition_format)
                                target_value = value + relativedelta(
                                    **relativedelta_attributes)
                                partition_value_formatted = target_value.strftime(
                                    partition_mask)
                                logger.info(
                                    f'This table {usage.upper()} dataset {dest_table} '
                                    f' Partition {partition} = {partition_value_formatted}'
                                )
                                # validate(table_db, table_name, table_partitions)
                            stage_b_message[
                                'prev_stage_processed_keys'] = processed_keys
                            stage_b_message['team'] = team
                            stage_b_message['pipeline'] = pipeline
                            stage_b_message['pipeline_stage'] = ''.join(
                                [stage[:-1], next_stage])
                            stage_b_message['dataset'] = dataset1
                            stage_b_message['org'] = body['org']
                            stage_b_message['app'] = body['app']
                            stage_b_message['env'] = body['env']
                            stage_b_message['behaviour'] = table[
                                'Usage'].lower()
                            stage_b_message['dest_db'] = dest_db
                            stage_b_message['dest_table'] = {}
                            stage_b_message['dest_table']['name'] = dest_table
                            stage_b_message['dest_table'][
                                'part_name'] = partition
                            stage_b_message['dest_table'][
                                'part_value'] = partition_value_formatted
                            stage_b_message['steps'] = item['steps']
                            stage_b_message[
                                'date_substitutions'] = date_substitutions

                    logger.info(
                        'Sending messages to next SQS queue if it exists')
                    logger.info(stage_b_message)
                    sqs_config = SQSConfiguration(
                        team, pipeline, ''.join([stage[:-1], next_stage]))
                    sqs_interface = SQSInterface(
                        sqs_config.get_stage_queue_name)
                    sqs_interface.send_message_to_fifo_queue(
                        json.dumps(stage_b_message),
                        '{}-{}'.format(team, pipeline))

            else:
                logger.info('This table triggers 0 datasets')

        octagon_client.update_pipeline_execution(
            status=f'{stage} {component} Processing', component=component)
        octagon_client.end_pipeline_execution_success()
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # The client only exists once the event was parsed successfully
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment=f'{stage} {component} Error: {repr(e)}')
        raise e
    return 200

Example #9

0

Show file

File: heavy_transform_athena.py Project: emarenas/aws-serverless-data-lake-framework

from dateutil.relativedelta import relativedelta
from datetime import timedelta
#######################################################
# Use S3 Interface to interact with S3 objects
# For example to download/upload them
#######################################################
from datalake_library.commons import init_logger
from datalake_library.configuration.resource_configs import S3Configuration, KMSConfiguration
from datalake_library.interfaces.s3_interface import S3Interface

# Root logger, with INFO and above enabled.
# NOTE(review): neither logging nor boto3 is among the imports visible in
# this excerpt -- confirm both are imported elsewhere in the file.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Module-level objects, created once when the module is imported.
s3_interface = S3Interface()
# IMPORTANT: Stage bucket where transformed data must be uploaded
stage_bucket = S3Configuration().stage_bucket
# artifacts bucket where .sql file is stored
artifacts_bucket = S3Configuration().artifacts_bucket
athena_client = boto3.client('athena')
glue_client = boto3.client('glue')


class CustomTransform():
    def __init__(self):
        logger.info("Athena Heavy Transform initiated")

    def transform_object(self, bucket, body, team, dataset):
        # def athena_status(query_execution):

        # returns table path, or table path with partition name
        # example if table has no partition