Code example #1
def lambda_handler(event, context):
    """ Checks if a dataset is driven by manifest file
    
    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with outcome of the process
    """

    try:
        logger.info("Fetching event data from previous step")
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        peh_id = event['body']['peh_id']
        env = event['body']['env']
        manifest_flag = event['body']['manifest_enabled']
        manifest_file_pattern = event['body']['manifest_details']['regex_pattern']
        manifest_file_timeout = event['body']['manifest_details']['manifest_timeout']
        manifest_datafile_timeout = event['body']['manifest_details']['manifest_data_timeout']
        input_file_name = event['body']['key'].split('/')[-1]

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(event['body']['env'])
            .build()
        )
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        ### Check if the file being processed is the manifest file

        match = re.match(manifest_file_pattern, input_file_name)

        if match:
            is_manifest_file = "True"
        else:
            is_manifest_file = "False"
        
        event['body']['is_manifest_file'] = is_manifest_file

        octagon_client.update_pipeline_execution(status="{} {} Processing".format(stage, component),
                                                 component=component)


    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(component=component,
                                                     issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        raise e

    return event
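
For reference, a minimal sketch of the event payload this handler reads, reconstructed from the keys accessed above; every value below is an invented placeholder, not taken from the source.

sample_event = {
    "body": {
        "team": "engineering",                  # placeholder values throughout
        "pipeline": "main",
        "pipeline_stage": "StageA",
        "dataset": "legislators",
        "peh_id": "12345678-peh",
        "env": "dev",
        "manifest_enabled": "True",
        "manifest_details": {
            "regex_pattern": ".*[.]manifest$",  # matched against the incoming file name
            "manifest_timeout": "30",
            "manifest_data_timeout": "30",
        },
        "key": "engineering/legislators/data_file.csv",
    }
}
# With this key the regex does not match, so (assuming the Octagon calls succeed)
# the handler records event['body']['is_manifest_file'] = "False".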
Code example #2
def lambda_handler(event, context):
    """Checks if the file to be processed is  manifest driven 

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with Processed Bucket and Key(s)
    """
    try:
        logger.info('Fetching event data from previous step')
        bucket = event['body']['bucket']
        keys_to_process = event['body']['keysToProcess']
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        ddb_key = team + "-" + dataset

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh_id = octagon_client.start_pipeline_execution(
            pipeline_name='{}-{}-stage-{}'.format(team, pipeline,
                                                  stage[-1].lower()),
            dataset_name='{}-{}'.format(team, dataset),
            comment=event)

        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        response = dynamo_interface.get_transform_table_item(ddb_key)
        logger.info("Querying DynamoDB to check for manifest details")

        event["body"]["manifest_enabled"] = response["manifest_enabled"]
        event["body"]["manifest_details"] = response["manifest_details"]

        # Pass the pipeline execution id downstream and clean up temporary files
        event['body']['peh_id'] = peh_id
        remove_content_tmp()
        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        remove_content_tmp()
        raise e
    return event
Code example #3
def lambda_handler(event, context):
    """Calls custom transform developed by user

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with Processed Bucket and Key(s)
    """
    try:
        logger.info('Fetching event data from previous step')
        bucket = event['body']['bucket']
        # keys_to_process = event['body']['keysToProcess']
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        stage = stage.replace('A', 'B')
        dataset = event['body']['dataset']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(event['body']['env'])
            .build()
        )
        peh_id = octagon_client.start_pipeline_execution(
            pipeline_name='{}-{}-stage-{}'.format(team,
                                                  pipeline, stage[-1].lower()),
            dataset_name='{}-{}'.format(team, dataset),
            comment=event
        )

        # Call custom transform created by user and process the file
        logger.info('Calling user custom processing code')
        transform_handler = TransformHandler().stage_transform(team, dataset, stage)
        response = transform_handler().transform_object(
            bucket, event['body'], team, dataset)  # custom user code called
        response['peh_id'] = peh_id
        remove_content_tmp()
        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component), component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(component=component,
                                                     issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        remove_content_tmp()
        raise e
    return response
Code example #4
def lambda_handler(event, context):
    """ Checks if a dataset is driven by manifest file
    
    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with outcome of the process
    """

    try:
        logger.info("Fetching event data from previous step")
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        peh_id = event['body']['peh_id']
        env = event['body']['env']
        ddb_key = team + "-" + dataset

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(event['body']['env'])
            .build()
        )
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)
        
        logger.info('Initializing DynamoDB config and Interface')
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        response = dynamo_interface.get_transform_table_item(ddb_key)

        event["body"]["manifest_enabled"] = response["manifest_enabled"]
        event["body"]["manifest_details"] = response["manifest_details"]

        octagon_client.update_pipeline_execution(status="{} {} Processing".format(stage, component),
                                                 component=component)

    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(component=component,
                                                     issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        raise e

    return event
Code example #5
def lambda_handler(event, context):
    """Calls custom job waiter developed by user

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with Processed Bucket, Key(s) and Job Details
    """
    try:
        logger.info('Fetching event data from previous step')
        bucket = event['body']['bucket']
        keys_to_process = event['body']['key']
        team = event['body']['team']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        job_details = event['body']['job']['jobDetails']
        processed_keys_path = event['body']['job']['processedKeysPath']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())

        logger.info('Checking Job Status with user custom code')
        transform_handler = TransformHandler().stage_transform(
            team, dataset, stage)
        response = transform_handler().check_job_status(
            bucket, keys_to_process, processed_keys_path,
            job_details)  # custom user code called
        response['peh_id'] = event['body']['peh_id']

        if event['body']['job']['jobDetails']['jobStatus'] == 'FAILED':
            peh.PipelineExecutionHistoryAPI(
                octagon_client).retrieve_pipeline_execution(response['peh_id'])
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: Check Job Logs".format(
                    stage, component))
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(
                event['body']['peh_id'])
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        raise e
    return response
Code example #6
def lambda_handler(event, context):
    """Calls custom transform developed by user

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with Processed Bucket and Key(s)
    """
    try:
        logger.info('Fetching event data from previous step')
        bucket = event['body']['bucket']
        key = event['body']['key']
        team = event['body']['team']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        ddb_key = event['body']['manifest_ddb_key']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(
                event['body']['peh_id'])

        # Call custom transform created by user and process the file
        logger.info('Calling user custom processing code')
        transform_handler = TransformHandler().stage_transform(
            team, dataset, stage)
        response = transform_handler().transform_object(
            bucket, key, team, dataset)  # custom user code called
        remove_content_tmp()
        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
        dynamo_interface.update_manifests_control_table_stagea(
            ddb_key, "PROCESSING", response[0])
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        remove_content_tmp()
        dynamo_interface.update_manifests_control_table_stagea(
            ddb_key, "FAILED")
        raise e
    return response
Code example #7
def lambda_handler(event, context):
    """Updates the objects metadata catalog

    Arguments:
        event {dict} -- Dictionary with details on S3 event
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with Processed Bucket and Key
    """
    try:
        logger.info('Fetching event data from previous step')
        object_metadata = json.loads(event)
        stage = object_metadata['pipeline_stage']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(object_metadata['env']).build())
        object_metadata['peh_id'] = octagon_client.start_pipeline_execution(
            pipeline_name='{}-{}-stage-{}'.format(object_metadata['team'],
                                                  object_metadata['pipeline'],
                                                  stage[-1].lower()),
            dataset_name='{}-{}'.format(object_metadata['team'],
                                        object_metadata['dataset']),
            comment=event)
        # Add business metadata (e.g. object_metadata['project'] = 'xyz')

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        logger.info('Storing metadata to DynamoDB')
        dynamo_interface.update_object_metadata_catalog(object_metadata)

        logger.info(
            'Passing arguments to the next function of the state machine')
        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        raise e
    return {'statusCode': 200, 'body': object_metadata}
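
Note that, unlike the other handlers here, this one calls json.loads on the raw event, which suggests it receives the object metadata as a JSON string (for example, a message body forwarded by a routing step) rather than an already-parsed dict. A sketch of such an input, limited to the keys the snippet actually reads, with invented values:

import json

sample_event = json.dumps({
    "team": "engineering",        # placeholder values only
    "pipeline": "main",
    "pipeline_stage": "StageA",
    "dataset": "legislators",
    "env": "dev",
})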
Code example #8
def lambda_handler(event, context):
    """Crawl Data using specified Glue Crawler

    Arguments:
        event {dict} -- Dictionary with details on Bucket and Keys
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with Processed Bucket and Keys Path
    """
    try:
        logger.info('Fetching event data from previous step')
        team = event['body']['team']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(
                event['body']['job']['peh_id'])

        crawler_name = '-'.join(['sdlf', team, dataset, 'post-stage-crawler'])
        logger.info('Starting Crawler {}'.format(crawler_name))
        try:
            client.start_crawler(Name=crawler_name)
        except client.exceptions.CrawlerRunningException:
            logger.info('Crawler is already running')
        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        raise e
    return 200
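
This handler depends on module-level objects (logger, the octagon/peh imports from the SDLF data lake library layer, and a Glue client) that are defined outside the snippet. A plausible setup for the logger and Glue client, assuming boto3, would be:

import logging
import boto3

logger = logging.getLogger()
logger.setLevel(logging.INFO)
client = boto3.client("glue")  # provides start_crawler and exceptions.CrawlerRunningException used above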
Code example #9
def lambda_handler(event, context):
    """ Process the manifest file and loads into DynamoDB
    
    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with outcome of the process
    """
    s3_interface = S3Interface()
    stage_bucket = S3Configuration().stage_bucket

    dynamo_config = DynamoConfiguration()
    dynamo_interface = DynamoInterface(dynamo_config)

    try:
        logger.info("Fetching event data from previous step")
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        peh_id = event['body']['peh_id']
        env = event['body']['env']
        bucket = event['body']['bucket']
        manifest_file_key = event['body']['key']
        manifest_file_name = manifest_file_key.split("/")[-1]

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(env).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        ### Download the manifest file to local

        local_path = s3_interface.download_object(bucket, manifest_file_key)

        ### Process the manifest file

        with open(local_path, "r") as raw_file:
            file_names = [
                file_name.strip().split("/")[-1] for file_name in raw_file
            ]

        ### Load data into manifests control table

        for file in file_names:
            item = {
                "dataset_name":
                team + "-" + dataset + "-" + manifest_file_name,
                "datafile_name": manifest_file_name + "-" + file
            }
            dynamo_interface.put_item_in_manifests_control_table(item)

        ### Set s3 path for Copy
        s3_path = 'pre-stage/{}/manifests/{}/{}'.format(
            team, dataset, manifest_file_name)
        kms_key = KMSConfiguration(team).get_kms_arn

        ### Copy Manifest File to team/manifest/dataset location

        s3_interface.copy_object(bucket,
                                 manifest_file_key,
                                 stage_bucket,
                                 s3_path,
                                 kms_key=kms_key)

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)

        processed_keys = [s3_path]

    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        raise e

    return processed_keys
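
For illustration only: given a manifest whose lines are S3 keys, the parsing loop above keeps just the trailing file name and writes one control-table item per data file. The names below are invented:

manifest_file_name = "input.manifest"            # placeholder
manifest_lines = [
    "raw/engineering/legislators/data_part_1.csv",
    "raw/engineering/legislators/data_part_2.csv",
]
file_names = [line.strip().split("/")[-1] for line in manifest_lines]
# -> ["data_part_1.csv", "data_part_2.csv"]

items = [
    {
        "dataset_name": "engineering-legislators-" + manifest_file_name,
        "datafile_name": manifest_file_name + "-" + name,
    }
    for name in file_names
]
# Each item mirrors what put_item_in_manifests_control_table receives above.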
Code example #10
def lambda_handler(event, context):
    """Updates the S3 objects metadata catalog

    Arguments:
        event {dict} -- Dictionary with details on Bucket and Keys
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with response
    """
    try:
        logger.info('Fetching event data from previous step')
        bucket = event['body']['bucket']
        processed_keys_path = event['body']['job']['processedKeysPath']
        processed_keys = S3Interface().list_objects(bucket,
                                                    processed_keys_path)
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        peh_id = event['body']['job']['peh_id']
        keys_to_process = event['body']['keysToProcess']
        s3_path = "post-stage/{}/manifests/{}/{}".format(
            team, dataset, keys_to_process[0].split("/")[-1])

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        logger.info('Storing metadata to DynamoDB')
        for key in processed_keys:
            object_metadata = {
                'bucket': bucket,
                'key': key,
                'size': S3Interface().get_size(bucket, key),
                'last_modified_date':
                S3Interface().get_last_modified(bucket, key),
                'org': event['body']['org'],
                'app': event['body']['app'],
                'env': event['body']['env'],
                'team': team,
                'pipeline': pipeline,
                'dataset': dataset,
                'stage': 'stage',
                'pipeline_stage': stage,
                'peh_id': peh_id
            }
            dynamo_interface.update_object_metadata_catalog(object_metadata)

        logger.info("Updating manifests control table")
        items = get_manifest_data(bucket, team, dataset, keys_to_process[0])
        ddb_keys = get_ddb_keys(items)

        for ddb_key in ddb_keys:
            dynamo_interface.update_manifests_control_table_stageb(
                ddb_key, "COMPLETED")

        logger.info("Move manifest file to post stage")
        kms_key = KMSConfiguration(team).get_kms_arn
        s3_interface = S3Interface()
        s3_interface.copy_object(bucket,
                                 keys_to_process[0],
                                 bucket,
                                 s3_path,
                                 kms_key=kms_key)

        logger.info("Removing manifest file from pre-stage")

        s3_interface.delete_objects(bucket, keys_to_process[0])

        # Only uncomment if a queue for the next stage exists
        # logger.info('Sending messages to next SQS queue if it exists')
        # sqs_config = SQSConfiguration(team, dataset, ''.join([stage[:-1], chr(ord(stage[-1]) + 1)]))
        # sqs_interface = SQSInterface(sqs_config.get_stage_queue_name)
        # sqs_interface.send_batch_messages_to_fifo_queue(processed_keys, 10, '{}-{}'.format(team, dataset))

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
        octagon_client.end_pipeline_execution_success()
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        for ddb_key in ddb_keys:
            dynamo_interface.update_manifests_control_table_stageb(
                ddb_key, "FAILED", None, "Failed in Post Update")
        raise e
    return 200
Code example #11
def lambda_handler(event, context):
    """Updates the S3 objects metadata catalog

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with outcome of the process
    """
    try:
        logger.info('Fetching event data from previous step')
        processed_keys = event['body']['processedKeys']
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        peh_id = event['body']['peh_id']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        logger.info('Storing metadata to DynamoDB')
        bucket = S3Configuration().stage_bucket
        for key in processed_keys:
            object_metadata = {
                'bucket': bucket,
                'key': key,
                'size': S3Interface().get_size(bucket, key),
                'last_modified_date':
                S3Interface().get_last_modified(bucket, key),
                'org': event['body']['org'],
                'app': event['body']['app'],
                'env': event['body']['env'],
                'team': team,
                'pipeline': pipeline,
                'dataset': dataset,
                'stage': 'stage',
                'pipeline_stage': stage,
                'peh_id': peh_id
            }

            dynamo_interface.update_object_metadata_catalog(object_metadata)

        #Workload management changes
        #---------------------------
        wlm_ddb_table = dynamo_interface.wlm_control_table
        item = dynamo_interface.get_item(
            wlm_ddb_table, {
                "name":
                "{}-{}-{}".format(team, dataset,
                                  processed_keys[0].split("/")[-2])
            })
        priority = item.get('priority', None)
        print(priority)
        #---------------------------

        logger.info('Sending messages to next SQS queue if it exists')
        sqs_config = SQSConfiguration(team, dataset, ''.join(
            [stage[:-1], chr(ord(stage[-1]) + 1)]),
                                      priority)  #Workload management changes
        sqs_interface = SQSInterface(
            sqs_config.get_stage_queue_name_wlm)  #Workload management changes
        sqs_interface.send_batch_messages_to_fifo_queue(
            processed_keys, 10, '{}-{}'.format(team, dataset))

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
        octagon_client.end_pipeline_execution_success()
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        raise e
    return 200
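
The next-stage queue is selected by bumping the last character of the current stage name; a quick illustration of that expression (the stage value is assumed):

stage = "StageA"                                             # illustrative
next_stage_name = "".join([stage[:-1], chr(ord(stage[-1]) + 1)])
# "StageA"[:-1] == "Stage" and chr(ord("A") + 1) == "B", so next_stage_name == "StageB"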
Code example #12
def lambda_handler(event, context):
    """Compile Data to a CSV with Topic Model Output

    Arguments:
        event {dict} -- Dictionary with details on Bucket and Keys
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with Processed Bucket and Keys Path
    """
    try:
        # Get Information about the Step Function
        logger.info('Fetching event data from previous step')
        team = event['body']['team']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        bucket = event['body']['bucket']

        # Start Connection to Octagon Client
        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(
                event['body']['job']['peh_id'])

        logger.info('Starting to Compile Results')

        # Here we will get associate topics and add to our existing metadata
        # for each of the abstract text files  we have in the pre-stage bucket:

        # Get the s3 location of the zipped topic model output
        key = "post-stage/{}/{}/".format(team, dataset)
        my_bucket = client.list_objects_v2(Bucket=bucket, Prefix=key)
        for objects in my_bucket["Contents"]:
            if ".tar.gz" in objects["Key"]:
                key = (objects["Key"])

        # Extract the Topic Model Data from the zipped file
        s3_object = client.get_object(Bucket=bucket, Key=key)
        wholefile = s3_object['Body'].read()
        fileobj = io.BytesIO(wholefile)
        tarf = tarfile.open(fileobj=fileobj)
        csv_files = [
            f.name for f in tarf.getmembers() if f.name.endswith('.csv')
        ]

        # Read in both the Doc-Topics and Topic-Terms csv files using Pandas DataFrames
        #   doc-topics.csv (The topics for each abstract document)
        #   topic-terms.csv (The terms associated to each topic - up to 10 terms)
        for i in csv_files:
            if "doc-topics" in i:
                csv_contents = tarf.extractfile(i).read()
                doc_topics = pd.read_csv(io.BytesIO(csv_contents),
                                         encoding='utf8')

            if "topic-terms" in i:
                csv_contents1 = tarf.extractfile(i).read()
                topic_terms = pd.read_csv(io.BytesIO(csv_contents1),
                                          encoding='utf8')

        # Group All of the Topics as a List for Each Abstract Docname
        doc_topics_grouped = doc_topics.groupby("docname")["topic"].apply(
            list).reset_index(name='topic_list')

        # Group All of the Terms Associated to each of the Topics Found
        topic_terms_grouped = topic_terms.groupby("topic")["term"].apply(
            list).reset_index(name='term_list')

        # For Each Abstract We Will Add a Column with the Associated Topic Terms (i.e. 'term_list')
        main_list = []
        for index, row in doc_topics_grouped.iterrows():
            labels = []
            for topic in row[1]:
                l = topic_terms_grouped.loc[topic][1]
                labels.extend(l)
            main_list.append(labels)
        doc_topics_grouped['term_list'] = main_list

        # Now Lets Pull All the PreStage Metadata we Have for Each Abstract Document:

        # List csv Files in the Pre-stage Bucket
        key = "pre-stage/{}/{}/medical_data".format(team, dataset)
        response = client.list_objects_v2(Bucket=bucket, Prefix=key)

        # Combine All the Metadata into one Large Pandas DataFrame
        count = 0
        for contents in response['Contents']:
            if contents['Size'] > 0:
                if count < 1:
                    obj = client.get_object(Bucket=bucket, Key=contents["Key"])
                    metadata = pd.read_csv(io.BytesIO(obj['Body'].read()),
                                           encoding='utf8')
                else:
                    obj = client.get_object(Bucket=bucket, Key=contents["Key"])
                    df = pd.read_csv(io.BytesIO(obj['Body'].read()),
                                     encoding='utf8')
                    metadata = metadata.append(df, ignore_index=True)
                count = count + 1

        # IMPORTANT: Now we can merge the Topics and Terms
        # we found for each document with the existing Metadata
        doc_topics_final = pd.merge(metadata, doc_topics_grouped, on='docname')

        # We will also create a training data csv (including topics and text only) so new documents
        # can be associated to one of these topics using  Multi-Label Classification:
        label_list = []
        for index, row in doc_topics_final.iterrows():
            if len(doc_topics_final["topic_list"][index]) > 1:
                listToStr = '|'.join([
                    str(elem) for elem in doc_topics_final["topic_list"][index]
                ])
                label_list.append(listToStr)
            else:
                label_list.append(str(
                    doc_topics_final["topic_list"][index][0]))

        # Create Training Data DataFrame from the two columns
        training_data = pd.DataFrame(list(
            zip(label_list, doc_topics_final["abstract"])),
                                     columns=['Labels', 'Abstracts'])

        # Get KMS Key to Encrypt Data
        kms_key = KMSConfiguration(team).get_kms_arn

        # Write Our DataFrames with Output to S3 Post-Stage:
        # Write Training data to s3 Post-Stage Bucket
        output_path = "training_data.csv"
        s3_path_key = "post-stage/{}/{}/multilabel_classification/{}".format(
            team, dataset, output_path)
        training_data.to_csv('/tmp/' + output_path, index=False, header=False)
        s3_interface.upload_object('/tmp/' + output_path,
                                   bucket,
                                   s3_path_key,
                                   kms_key=kms_key)

        # Write Final df to s3 Post-Stage Bucket
        output_path = "compile_topics_data.csv"
        s3_path_key = "post-stage/{}/{}/{}".format(team, dataset, output_path)
        doc_topics_final.to_csv('/tmp/' + output_path)
        s3_interface.upload_object('/tmp/' + output_path,
                                   bucket,
                                   s3_path_key,
                                   kms_key=kms_key)

        # Write doc_topics df to s3 Post-Stage Bucket
        output_path = "doc_topics.csv"
        s3_path_key = "post-stage/{}/{}/topic_data/{}".format(
            team, dataset, output_path)
        doc_topics.to_csv('/tmp/' + output_path)
        s3_interface.upload_object('/tmp/' + output_path,
                                   bucket,
                                   s3_path_key,
                                   kms_key=kms_key)

        # Write topic_terms df to s3 Post-Stage Bucket
        output_path = "topic_terms.csv"
        s3_path_key = "post-stage/{}/{}/topic_data/{}".format(
            team, dataset, output_path)
        topic_terms.to_csv('/tmp/' + output_path)
        s3_interface.upload_object('/tmp/' + output_path,
                                   bucket,
                                   s3_path_key,
                                   kms_key=kms_key)

        # Update Pipeline Execution in Octagon
        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        raise e
    return 200
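
A toy walk-through of the doc-topics/topic-terms grouping and the term-lookup loop above, using invented rows shaped like the doc-topics.csv and topic-terms.csv files read in the handler:

import pandas as pd

doc_topics = pd.DataFrame({"docname": ["a.txt", "a.txt", "b.txt"], "topic": [0, 1, 1]})
topic_terms = pd.DataFrame({"topic": [0, 0, 1], "term": ["heart", "lung", "brain"]})

doc_topics_grouped = doc_topics.groupby("docname")["topic"].apply(list).reset_index(name="topic_list")
topic_terms_grouped = topic_terms.groupby("topic")["term"].apply(list).reset_index(name="term_list")

# Same lookup as in the handler: concatenate the terms of every topic per document
# (mirrors the handler's positional .loc[topic][1] access).
main_list = []
for _, row in doc_topics_grouped.iterrows():
    labels = []
    for topic in row[1]:
        labels.extend(topic_terms_grouped.loc[topic][1])
    main_list.append(labels)
doc_topics_grouped["term_list"] = main_list
# a.txt -> ["heart", "lung", "brain"], b.txt -> ["brain"]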
Code example #13
def lambda_handler(event, context):
    """Updates the S3 objects metadata catalog

    Arguments:
        event {dict} -- Dictionary with details on Bucket and Keys
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with response
    """
    def replace_decimals(obj):
        if isinstance(obj, list):
            for i in range(len(obj)):
                obj[i] = replace_decimals(obj[i])
            return obj
        elif isinstance(obj, dict):
            for k, v in obj.items():
                obj[k] = replace_decimals(v)
            return obj
        elif isinstance(obj, set):
            return set(replace_decimals(i) for i in obj)
        elif isinstance(obj, decimal.Decimal):
            if obj % 1 == 0:
                return int(obj)
            else:
                return float(obj)
        else:
            return obj

    def get_table_partitions(db, tbl):
        glue_response = glue_client.get_table(DatabaseName=db, Name=tbl)
        logger.debug('Glue get_table response: {}'.format(glue_response))
        return glue_response['Table']['PartitionKeys']

    try:
        logger.info('Fetching event data from previous step')
        bucket = event['body']['bucket']
        processed_keys_path = event['body']['job']['processedKeysPath']
        processed_keys = S3Interface().list_objects(bucket,
                                                    processed_keys_path)
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset1 = event['body']['dataset']
        peh_id = event['body']['job']['peh_id']
        env = event['body']['env']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        logger.info(
            'Storing metadata to DynamoDB and tagging resulting S3 Objects')
        for key in processed_keys:
            object_metadata = {
                'bucket': bucket,
                'key': key,
                'size': S3Interface().get_size(bucket, key),
                'last_modified_date':
                S3Interface().get_last_modified(bucket, key),
                'org': event['body']['org'],
                'app': event['body']['app'],
                'env': event['body']['env'],
                'team': team,
                'pipeline': pipeline,
                'dataset': dataset1,
                'stage': 'stage',
                'pipeline_stage': stage,
                'peh_id': peh_id
            }
            dynamo_interface.update_object_metadata_catalog(object_metadata)

            tag_keys = ['org', 'app', 'env', 'team', 'dataset']
            tag_dict = {key: object_metadata[key] for key in tag_keys}
            S3Interface().tag_object(bucket, key, tag_dict)

        # Only uncomment if a queue for the next stage exists
        # logger.info('Sending messages to next SQS queue if it exists')
        # sqs_config = SQSConfiguration(team, dataset, ''.join([stage[:-1], chr(ord(stage[-1]) + 1)]))
        # sqs_interface = SQSInterface(sqs_config.get_stage_queue_name)
        # sqs_interface.send_batch_messages_to_fifo_queue(processed_keys, 10, '{}-{}'.format(team, dataset))
        prestage_table = event['body']['dest_table']['name']
        prestage_db = event['body']['dest_db']
        dest_part_name = event['body']['dest_table']['part_name']
        dest_part_value = event['body']['dest_table']['part_value']
        processOutput = {}
        if dest_part_name != '' and dest_part_value != '':
            partitions = []
            part_dict = {"name": dest_part_name, "value": dest_part_value}
            partitions.append(part_dict)
            processOutput['partitions'] = partitions
        processOutput['processed_keys'] = processed_keys
        ssmresponse = ssmcli.get_parameter(
            Name=f'/SDLF/DDB/{team}/{pipeline}/DependenciesByTable')
        ddb_dependencies_by_table = ssmresponse['Parameter']['Value']
        ddb_table = dynamodb.Table(ddb_dependencies_by_table)
        ssmresponse = ssmcli.get_parameter(
            Name=f'/SDLF/DDB/{team}/{pipeline}/Dependencies')
        ddb_dependencies = ssmresponse['Parameter']['Value']
        consulta = f'{prestage_db.lower()}.{prestage_table.lower()}'
        logger.info(consulta)
        response = ddb_table.get_item(Key={'table_name': consulta})
        logger.info(f'Response {response}')
        if 'Item' in response:
            list_transforms = response['Item']['list_transforms']
            num_of_transforms = len(list_transforms)
            logger.debug(f'Response {response}')
            logger.info(f'This table triggers {num_of_transforms} datasets')
            next_stage = 'B'
            stage_b_message = {}
            for dataset in list_transforms:
                ddb_steps = dynamodb.Table(ddb_dependencies)
                logger.info(dataset)
                response = ddb_steps.get_item(Key={'dataset': dataset})
                logger.info(f'Response {response}')
                num_of_transforms = len(list_transforms)
                item = response['Item']
                dest_table = item['dataset'].split('.')[1]
                dest_db = item['dataset'].split('.')[0]
                dependencies = item['dependencies']
                date_substitutions = replace_decimals(
                    item.get('date_substitutions', []))
                logger.info(f'Dependencies: {dependencies}')
                partition = item.get('partitionColumn', '')
                partition_mask = item.get('partitionPythonMask', None)
                partition_value_formatted = None
                table_check = []
                for table in dependencies:
                    table_name = table['TableName'].split('.')[1]
                    table_db = table['TableName'].split('.')[0]
                    table_partition = table.get('FieldColumn', '')
                    table_partition_format = table.get('DateExpression', None)
                    relativedelta_attributes = replace_decimals(
                        table.get('relativedelta_attributes', None))
                    table_partitions = processOutput.get('partitions', [])
                    usage = table.get('Usage', 'validate').lower()
                    if usage == 'validate':
                        if prestage_db == table_db and prestage_table == table_name:
                            logger.info(
                                f'This table does not update/overwrite {dataset} dataset'
                            )
                            break
                        else:
                            logger.debug(
                                f'Table {table_db}.{table_name} is not the trigger table'
                            )
                    else:
                        if prestage_db.lower() == table_db.lower(
                        ) and prestage_table.lower() == table_name.lower():
                            # dst_tbl_partitions = get_table_partitions(prestage_db,prestage_table)
                            partition_value_formatted = ''
                            # If dest table has partitions and source table has partitions
                            logger.debug(
                                f'Partition: {partition}, table_partitions: {table_partitions}'
                            )
                            if table_partitions and table_partition_format is not None:
                                table_partition_value = table_partitions[0][
                                    'value']
                                value = datetime.strptime(
                                    table_partition_value,
                                    table_partition_format)
                                target_value = value - relativedelta(
                                    **relativedelta_attributes)
                                partition_value_formatted = target_value.strftime(
                                    partition_mask)
                                logger.info(
                                    f'This table {usage.upper()} dataset {dest_table} '
                                    f' Partition {partition} = {partition_value_formatted}'
                                )
                                # validate(table_db, table_name, table_partitions)
                            stage_b_message[
                                'prev_stage_processed_keys'] = processed_keys
                            stage_b_message['team'] = team
                            stage_b_message['pipeline'] = pipeline
                            stage_b_message['pipeline_stage'] = ''.join(
                                [stage[:-1], next_stage])
                            stage_b_message['dataset'] = dataset1
                            stage_b_message['org'] = event['body']['org']
                            stage_b_message['app'] = event['body']['app']
                            stage_b_message['env'] = event['body']['env']
                            stage_b_message['behaviour'] = table[
                                'Usage'].lower()
                            stage_b_message['dest_db'] = dest_db
                            stage_b_message['dest_table'] = {}
                            stage_b_message['dest_table']['name'] = dest_table
                            stage_b_message['dest_table'][
                                'part_name'] = partition
                            stage_b_message['dest_table'][
                                'part_value'] = partition_value_formatted
                            stage_b_message['steps'] = item['steps']
                            stage_b_message[
                                'date_substitutions'] = date_substitutions
                            logger.info(
                                'Sending messages to next SQS queue if it exists'
                            )
                            # Get queue by SSM
                            logger.info(stage_b_message)
                            sqs_config = SQSConfiguration(
                                team, pipeline, stage)
                            sqs_interface = SQSInterface(
                                sqs_config.get_stage_queue_name)
                            sqs_interface.send_message_to_fifo_queue(
                                json.dumps(stage_b_message),
                                '{}-{}'.format(team, pipeline))
                            break

        else:
            logger.info('This table triggers 0 datasets')

        octagon_client.update_pipeline_execution(
            status=f'{stage} {component} Processing', component=component)
        octagon_client.end_pipeline_execution_success()
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment=f'{stage} {component} Error: {repr(e)}')
        raise e
    return 200
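
The nested replace_decimals helper exists because boto3's DynamoDB resource returns numbers as decimal.Decimal; a quick illustration of what it produces (the item below is invented):

import decimal

item = {
    "relativedelta_attributes": {"months": decimal.Decimal("1")},
    "threshold": decimal.Decimal("0.5"),
}
# replace_decimals(item) would yield
# {"relativedelta_attributes": {"months": 1}, "threshold": 0.5}
# i.e. whole-number Decimals become int, the rest become float.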
Code example #14
def lambda_handler(event, context):
    """ Load Datafile metadata in manifests control table
        Check if manifest file is available within the threshold
    
    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with outcome of the process
    """
    s3_interface = S3Interface()
    stage_bucket = S3Configuration().stage_bucket

    dynamo_config = DynamoConfiguration()
    dynamo_interface = DynamoInterface(dynamo_config)
    current_time = dt.datetime.utcnow()
    current_timestamp = current_time.timestamp()

    try:
        logger.info("Fetching event data from previous step")
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        peh_id = event['body']['peh_id']
        env = event['body']['env']
        bucket = event['body']['bucket']
        input_file_key = event['body']['key']
        input_file_name = input_file_key.split("/")[-1]
        manifest_file_pattern = event['body']['manifest_details'][
            'regex_pattern']
        manifest_timeout = int(
            event['body']['manifest_details']['manifest_timeout'])

        if 'manifest_interval' in event['body']:
            manifest_interval = event['body']['manifest_interval']
        else:
            manifest_interval = current_timestamp

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(env).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)

        ### List S3 Objects for the manifest file in the manifest prefix
        ### For this to work the manifest should have been loaded into DynamoDB

        manifest_key = "pre-stage/{}/manifests/{}/".format(team, dataset)
        processed_manifest_keys = s3_interface.list_objects(
            stage_bucket, manifest_key)

        matched_keys = []
        items = []

        if not processed_manifest_keys:
            logger.info(
                "Manifest File has not been loaded, sleeping for 5 mins")
            time.sleep(300)
            manifest_file_loaded = "False"

        else:
            for manifest_file_key in processed_manifest_keys:
                manifest_file_name = manifest_file_key.split("/")[-1]
                match = re.match(manifest_file_pattern, manifest_file_name)
                if match:
                    matched_keys.append(manifest_file_name)

                ### Query Manifests Control table
                for keys in matched_keys:
                    dataset_name = team + "-" + dataset
                    try:
                        items.append(
                            dynamo_interface.
                            get_item_from_manifests_control_table(
                                dataset_name, keys, input_file_name))
                    except KeyError:
                        logger.info(
                            "Manifest File has not been loaded, sleeping for 5 mins"
                        )
                        manifest_file_loaded = "False"

                ### Update Manifests Control table

                if not items:
                    logger.info(
                        "Manifest File has not been loaded, sleeping for 5 mins"
                    )
                    time.sleep(300)
                    manifest_file_loaded = "False"
                else:
                    ddb_key = {
                        'dataset_name': items[0]['dataset_name'],
                        'datafile_name': items[0]['datafile_name']
                    }
                    STATUS = "STARTED"
                    dynamo_interface.update_manifests_control_table_stagea(
                        ddb_key, STATUS)
                    manifest_file_loaded = "True"
                    event['body']['manifest_ddb_key'] = ddb_key

        ### Check if Manifest threshold has exceeded

        if current_timestamp == manifest_interval:
            current_timestamp = dt.datetime.utcnow().timestamp()

        if int(
            (current_timestamp - manifest_interval) / 60) >= manifest_timeout:
            logger.error("Manifest Threshold Breached")
            raise Exception("Manifest Threshold Breached")

        event['body']['manifest_interval'] = manifest_interval
        event['body']['manifest_file_loaded'] = manifest_file_loaded

    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        raise e

    return event
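
The threshold check near the end compares the minutes elapsed since the first polling attempt (manifest_interval) with manifest_timeout; a worked example with invented numbers:

manifest_interval = 1_700_000_000.0                # timestamp recorded on the first invocation (invented)
current_timestamp = manifest_interval + 35 * 60    # 35 minutes later
manifest_timeout = 30                              # minutes, from manifest_details

elapsed_minutes = int((current_timestamp - manifest_interval) / 60)   # 35
threshold_breached = elapsed_minutes >= manifest_timeout              # True -> the exception is raised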
Code example #15
def lambda_handler(event, context):
    """Checks dependent datasets status

    Arguments:
        event {dict} -- Dictionary with details on datasets dependency
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with details on datasets dependency
    """
    try:
        logger.info("Dataset dependency Lambda")
        bucket = event['body']['bucket']
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        env = event['body']['env']
        dependent_stage = event['body']['dependent_stage']
        retry_count = event['body']["retry_count"]

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(env).build())
        if 'peh_id' not in event['body']:
            peh_id = octagon_client.start_pipeline_execution(
                pipeline_name='{}-{}-stage-{}'.format(team, pipeline,
                                                      stage[-1].lower()),
                dataset_name='{}-{}'.format(team, dataset),
                comment=event)
        else:
            peh_id = event['body']['peh_id']
            octagon.peh.PipelineExecutionHistoryAPI(
                octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info("Checking dependent tables status")
        dependent_datasets = get_dependent_datasets(team, dataset)

        atomic_completed_datasets_count = 0
        for each_dataset in dependent_datasets:
            output = get_dynamodb_peh_status(env,
                                             dependent_datasets[each_dataset],
                                             dependent_stage,
                                             get_current_date())
            if output == "COMPLETED":
                atomic_completed_datasets_count += 1

        dependent_datasets_status = "SUCCEEDED" if len(
            dependent_datasets
        ) == atomic_completed_datasets_count else "FAILED"

        octagon_client.update_pipeline_execution(
            status="{} {} Dependent Datasets Status".format(stage, component),
            component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        raise e
    return {
        "body": {
            "bucket": bucket,
            "team": team,
            "pipeline": pipeline,
            "pipeline_stage": stage,
            "dataset": dataset,
            "env": env,
            "dependent_stage": dependent_stage,
            "retry_count": retry_count + 1,
            "dependent_datasets_status": dependent_datasets_status,
            "peh_id": peh_id
        }
    }
Code example #16
def lambda_handler(event, context):
    """Write Metadata JSON Files for Data Source

    Arguments:
        event {dict} -- Dictionary with details on Bucket and Keys
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with Processed Bucket and Keys Path
    """
    try:
        logger.info('Fetching event data from previous step')
        team = event['body']['team']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        bucket = event['body']['bucket']

        # This Stage will Add Metadata Directory
        # (NOTE: We can use metadata to filter queries in Amazon Kendra):

        # Add a Metadata Directory for a s3 location to write json files
        directory_key = "pre-stage/{}/{}/datasource_metadata/".format(
            team, dataset)
        s3client.put_object(Bucket=bucket, Key=directory_key)

        # Get KMS Key to Encrypt Data
        kms_key = KMSConfiguration(team).get_kms_arn

        # Read in our compiled metadata and topic data in a DataFrame
        key = "post-stage/{}/{}/compile_topics_data.csv".format(team, dataset)
        obj = s3client.get_object(Bucket=bucket, Key=key)
        metadata = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding='utf8')

        # Add a Dictionary to Pass JSON Structure Parameters for Each
        # Lambda Invocation (one for every 10,000 rows so no timeouts)
        rows = metadata["abstract"].count()
        invocations = int((rows / 10000) + 1)
        jobs = {}
        jobList = []
        for i in range(0, invocations):
            # Set Start and End Rows for each Lambda
            start = i * 10000

            if (i + 1) == invocations:
                end = rows
            else:
                end = (i + 1) * 10000

            # Send a Payload with the s3 path to write and the start/end row count
            payload = {
                "start": str(start),
                "end": str(end),
                "key": key,
                "bucket": bucket,
                "directory_key": directory_key,
                "team": team,
                "dataset": dataset
            }
            jobList.append(payload)

        jobs["jobList"] = jobList

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(
                event['body']['job']['peh_id'])

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)

    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        raise e
    return jobs
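A possible shape for the worker Lambda that consumes each jobList payload (for example from a Step Functions Map state): it re-reads the compiled CSV, slices out its assigned rows, and writes one metadata JSON object per row under directory_key. This is a sketch under assumptions; the handler name, the attribute mapping, and the per-row file naming are not part of the source.

import io
import json

import boto3
import pandas as pd

s3client = boto3.client("s3")


def metadata_worker_handler(event, context):  # hypothetical worker, one per jobList payload
    # Read only the row slice this invocation is responsible for
    start, end = int(event["start"]), int(event["end"])
    obj = s3client.get_object(Bucket=event["bucket"], Key=event["key"])
    metadata = pd.read_csv(io.BytesIO(obj["Body"].read()), encoding="utf8")

    # Write one <name>.metadata.json object per row in this invocation's slice
    for idx, row in metadata.iloc[start:end].iterrows():
        document_metadata = {
            "Title": str(row.get("abstract", ""))[:100],  # illustrative attribute mapping
            "Attributes": {"dataset": event["dataset"], "row_id": int(idx)},
        }
        s3client.put_object(
            Bucket=event["bucket"],
            Key="{}row_{}.metadata.json".format(event["directory_key"], idx),
            Body=json.dumps(document_metadata),
        )

    return {"start": start, "end": end, "rows_written": end - start}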
Code Example #17
def lambda_handler(event, context):
    """Checks if the file to be processed is  manifest driven 

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Event dictionary enriched with data_file_wait and manifest_interval
    """
    try:
        logger.info('Fetching event data from previous step')
        bucket = event['body']['bucket']
        keys_to_process = event['body']['keysToProcess']
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        peh_id = event['body']['peh_id']
        manifest_data_timeout = int(
            event['body']['manifest_details']['manifest_data_timeout'])
        current_time = dt.datetime.utcnow()
        current_timestamp = current_time.timestamp()

        if 'manifest_interval' in event['body']:
            manifest_interval = event['body']['manifest_interval']
        else:
            manifest_interval = current_timestamp

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())

        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        ### Build the DynamoDB control-table keys for each data file listed in the manifest
        ### (the state machine processes one manifest file at a time)

        ddb_keys = get_ddb_keys(keys_to_process, bucket, team, dataset)

        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        ### Query Manifest Control Table to get the status
        items = []

        logger.info(
            "Querying the manifests control table in DynamoDB for Stage A status")

        for ddb_key in ddb_keys:
            try:
                items.append(
                    dynamo_interface.get_item_from_manifests_control_table(
                        ddb_key["dataset_name"], ddb_key["manifest_file_name"],
                        ddb_key["datafile_name"]))
            except KeyError:
                logger.error(
                    "The manifest file has not been processed in Stage A")
                raise Exception(
                    "Manifest File has not been processed in Stage A")

        ### Check Stage A status for the data files
        logger.info(
            "Checking to see if all the files have been processed in Stage A")

        status_message_list = []
        failed_status_message_list = []
        wait_message_counter = 0
        failed_message_counter = 0

        for item in items:
            if "stage_a_status" in item:
                stage_a_status = item["stage_a_status"]
            else:
                stage_a_status = "NOT STARTED"

            if stage_a_status != "COMPLETED" and stage_a_status != "FAILED":
                status_message_list.append("Waiting for Data File {}".format(
                    item["datafile_name"].split("-")[-1]))
                wait_message_counter += 1

            elif stage_a_status == "FAILED":
                failed_status_message_list.append(
                    "Data Files Failed in Stage A {}".format(
                        item["datafile_name"].split("-")[-1]))
                failed_message_counter += 1

        if failed_message_counter > 0:
            logger.error("Data File Failure in Stage A, Processing will stop")
            logger.error("The following files have failed in Stage A")
            for message in failed_status_message_list:
                logger.error(message)
            ### Update manifest control table, mark all files as failed in Stage B
            for ddb_key in ddb_keys:
                update_key = dynamo_interface.manifest_keys(
                    ddb_key["dataset_name"], ddb_key["manifest_file_name"],
                    ddb_key["datafile_name"])
                dynamo_interface.update_manifests_control_table_stageb(
                    update_key, "FAILED", None, "Datafile Failed in Stage A")
            raise Exception("Data File Failure in Stage A")

        if wait_message_counter > 0:
            logger.info("Waiting for Data Files to be processed in Stage A")
            for message in status_message_list:
                logger.info(message)
            logger.info("Will sleep for 5 mins")
            time.sleep(300)
            data_file_wait = "True"
            if manifest_interval == current_timestamp:
                current_timestamp = dt.datetime.utcnow().timestamp()
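            # Elapsed wait is measured in whole minutes against manifest_data_timeout:
            # e.g. with manifest_data_timeout = 60, a run that first found missing files at
            # 09:00 UTC keeps looping (one 5-minute sleep per pass) until roughly 10:00 UTC,
            # after which the outstanding data files are marked FAILED in the control table.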

            if int((current_timestamp - manifest_interval) /
                   60) >= manifest_data_timeout:
                logger.error("Data File Threshold Breached")
                logger.error("Stage B Processing Will Stop Now")
                data_file_wait = "False"
                for message in status_message_list:
                    logger.error(message)
                ### Update manifest control table, mark all files as failed in Stage B
                for ddb_key in ddb_keys:
                    update_key = dynamo_interface.manifest_keys(
                        ddb_key["dataset_name"], ddb_key["manifest_file_name"],
                        ddb_key["datafile_name"])
                    dynamo_interface.update_manifests_control_table_stageb(
                        update_key, "FAILED", None,
                        "Datafile threshold Breached")
                raise Exception("Data File Threshold Breached")
        else:
            logger.info("All files processed in Stage A")
            data_file_wait = "False"
            for ddb_key in ddb_keys:
                update_key = dynamo_interface.manifest_keys(
                    ddb_key["dataset_name"], ddb_key["manifest_file_name"],
                    ddb_key["datafile_name"])
                dynamo_interface.update_manifests_control_table_stageb(
                    update_key, "STARTED")

        event["body"]["manifest_interval"] = manifest_interval
        event["body"]["data_file_wait"] = data_file_wait

        remove_content_tmp()
        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        remove_content_tmp()
        raise e
    return event
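The get_ddb_keys helper used above is not shown. A minimal sketch follows, assuming each entry in keysToProcess points at a manifest file in S3 whose non-empty lines each name one data file; the composite datafile_name format is an assumption, chosen only to stay consistent with the split("-")[-1] calls above.

import boto3

s3client = boto3.client("s3")


def get_ddb_keys(keys_to_process, bucket, team, dataset):
    # One manifests-control-table key per data file listed in each manifest
    ddb_keys = []
    for manifest_key in keys_to_process:
        manifest_file_name = manifest_key.split("/")[-1]
        body = s3client.get_object(Bucket=bucket, Key=manifest_key)["Body"].read().decode("utf-8")
        for line in (l.strip() for l in body.splitlines()):
            if not line:
                continue
            datafile = line.split("/")[-1]
            ddb_keys.append({
                "dataset_name": "{}-{}".format(team, dataset),
                "manifest_file_name": manifest_file_name,
                "datafile_name": "{}-{}".format(manifest_file_name, datafile),  # assumed format
            })
    return ddb_keys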
Code Example #18
def lambda_handler(event, context):
    """Updates the S3 objects metadata catalog

    Arguments:
        event {dict} -- Dictionary with details on Bucket and Keys
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {list} -- List of table names used to enable the Deequ data quality job
    """
    try:
        logger.info('Fetching event data from previous step')
        bucket = event['body']['bucket']
        processed_keys_path = event['body']['job']['processedKeysPath']
        processed_keys = S3Interface().list_objects(bucket,
                                                    processed_keys_path)
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(
                event['body']['job']['peh_id'])

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        logger.info('Storing metadata to DynamoDB')
        for key in processed_keys:
            object_metadata = {
                'bucket': bucket,
                'key': key,
                'team': team,
                'pipeline': pipeline,
                'dataset': dataset,
                'peh_id': event['body']['job']['peh_id'],
                'stage': 'post-stage'
            }
            dynamo_interface.update_object_metadata_catalog(object_metadata)

        # Add Tables to Result Path to Enable Deequ Job
        table_path = "compile_topics_data_csv"
        tables = [table_path]

        # Only uncomment if using Kendra and index and data source ALREADY created
        # Data Sync Job
        # kendra_client = boto3.client('kendra')
        # response = kendra_client.start_data_source_sync_job(
        #         Id='ENTER_DATASOURCE_ID',
        #         IndexId='ENTER_INDEX_ID'
        #         )

        # Only uncomment if a queue for the next stage exists
        # logger.info('Sending messages to next SQS queue if it exists')
        # sqs_config = SQSConfiguration(team, dataset, ''.join([stage[:-1], chr(ord(stage[-1]) + 1)]))
        # sqs_interface = SQSInterface(sqs_config.get_stage_queue_name)
        # sqs_interface.send_batch_messages_to_fifo_queue(processed_keys, 10, '{}-{}'.format(team, dataset))

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
        octagon_client.end_pipeline_execution_success()
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        octagon_client.end_pipeline_execution_failed(
            component=component,
            issue_comment="{} {} Error: {}".format(stage, component, repr(e)))
        raise e
    return tables
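For reference, a minimal sketch of what DynamoInterface.update_object_metadata_catalog could do, assuming a simple per-object catalog table keyed by a "<bucket>/<key>" id; the class, table name, and attributes below are illustrative stand-ins, not the library's actual implementation.

import datetime as dt

import boto3


class ObjectMetadataCatalog:  # hypothetical stand-in for DynamoInterface
    def __init__(self, table_name="octagon-ObjectMetadata"):  # hypothetical table name
        self.table = boto3.resource("dynamodb").Table(table_name)

    def update_object_metadata_catalog(self, object_metadata):
        # Upsert one item per processed S3 object, keyed by "<bucket>/<key>"
        item = dict(object_metadata)
        item["id"] = "{}/{}".format(item["bucket"], item["key"])
        item["timestamp"] = dt.datetime.utcnow().isoformat()
        self.table.put_item(Item=item)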