def lambda_handler(event, context):
    """Redrives messages from the stage DLQ into the stage state machine.

    Pulls up to one message from the stage dead-letter queue and starts a
    state machine execution with its payload, deleting the message on success.

    Arguments:
        event {dict} -- must contain 'dataset'; team/pipeline/stage come from env
        context {dict} -- Lambda context (unused)
    """
    try:
        team = os.environ['TEAM']
        pipeline = os.environ['PIPELINE']
        dataset = event['dataset']
        stage = os.environ['STAGE']
        state_config = StateMachineConfiguration(team, pipeline, stage)
        sqs_config = SQSConfiguration(team, dataset, stage)
        dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name)
        messages = dlq_interface.receive_messages(1)
        if not messages:
            logger.info('No messages found in {}'.format(
                sqs_config.get_stage_dlq_name))
            return
        logger.info('Received {} messages'.format(len(messages)))
        for message in messages:
            logger.info('Starting State Machine Execution')
            # Fix: previously `response` was only bound when body was a str,
            # so a non-str body raised NameError. Use the raw body otherwise.
            if isinstance(message.body, str):
                response = json.loads(message.body)
            else:
                response = message.body
            StatesInterface().run_state_machine(
                state_config.get_stage_state_machine_arn, response)
            message.delete()
            logger.info('Delete message succeeded')
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        raise e
    return
def lambda_handler(event, context):
    """Forwards a failed execution payload to the stage dead-letter queue.

    Accepts the payload either as a dict or as a JSON-encoded string.

    Arguments:
        event {dict|str} -- original payload; must carry team/pipeline/pipeline_stage
        context {dict} -- Lambda context (unused)
    """
    try:
        # Normalize a JSON-string payload into a dict before reading keys.
        payload = json.loads(event) if isinstance(event, str) else event
        queue_config = SQSConfiguration(payload['team'], payload['pipeline'],
                                        payload['pipeline_stage'])
        dlq = SQSInterface(queue_config.get_stage_dlq_name)
        logger.info('Execution Failed. Sending original payload to DLQ')
        dlq.send_message_to_fifo_queue(json.dumps(payload), 'failed')
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        raise e
    return
def lambda_handler(event, context):
    """Routes the incoming keys to the correct priority (WLM) SQS queue.

    Looks up the workload-management priority for the dataset/table in the
    WLM control DynamoDB table, then forwards the whole event to the
    matching priority FIFO queue for this stage.

    Arguments:
        event {dict} -- 'body' must carry keysToProcess/team/pipeline/
                        pipeline_stage/dataset/bucket
        context {dict} -- Lambda context (unused)

    Returns:
        {dict} -- {'statusCode': 200} on success
    """
    try:
        logger.info('Fetching event data from previous step')
        processed_keys = event['body']['keysToProcess']
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        bucket = event['body']['bucket']
        logger.info('Initializing DynamoDB config and Interface')
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)
        wlm_ddb_table = dynamo_interface.wlm_control_table
        # Control-table key is <team>-<dataset>-<table dir of first key>;
        # assumes keys look like .../<table>/<file> — TODO confirm.
        item = dynamo_interface.get_item(
            wlm_ddb_table,
            {"name": "{}-{}-{}".format(team, dataset,
                                       processed_keys[0].split("/")[-2])})
        priority = item.get('priority', None)
        # Fix: ''.join([stage[:-1], chr(ord(stage[-1]))]) is an identity
        # transform of `stage` (the +1 variant advances to the NEXT stage
        # elsewhere in the file); use `stage` directly. Debug print()s
        # replaced with logger.debug.
        logger.debug('priority=%s stage=%s', priority, stage)
        logger.info('Sending messages to right priority SQS queue')
        sqs_config = SQSConfiguration(team, dataset, stage,
                                      priority)  # Workload management changes
        sqs_interface = SQSInterface(
            sqs_config.get_stage_queue_name_wlm)  # Workload management changes
        sqs_interface.send_message_to_fifo_queue(
            json.dumps(event), '{}-{}'.format(team, dataset))
        logger.info("lambda Completed")
        return {'statusCode': 200}
    except Exception as e:
        # Consistency with sibling handlers: log before re-raising.
        logger.error("Fatal error", exc_info=True)
        raise e
def lambda_handler(event, context):
    """Redrives messages from the stage DLQ back onto the main stage queue.

    Receives up to one message from the dead-letter queue, re-sends its body
    to the primary FIFO queue under the 'redrive' message group, and deletes
    the original.

    Arguments:
        event {dict} -- unused; configuration comes from environment variables
        context {dict} -- Lambda context (unused)
    """
    try:
        queue_config = SQSConfiguration(os.environ['TEAM'],
                                        os.environ['PIPELINE'],
                                        os.environ['STAGE'])
        dlq = SQSInterface(queue_config.get_stage_dlq_name)
        messages = dlq.receive_messages(1)
        if not messages:
            logger.info('No messages found in {}'.format(
                queue_config.get_stage_dlq_name))
            return
        logger.info('Received {} messages'.format(len(messages)))
        main_queue = SQSInterface(queue_config.get_stage_queue_name)
        for msg in messages:
            main_queue.send_message_to_fifo_queue(msg.body, 'redrive')
            msg.delete()
            logger.info('Delete message succeeded')
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        raise e
    return
def lambda_handler(event, context):
    """Checks if any items need processing and triggers state machine

    Drains the HIGH then LOW priority WLM queues into free step-function
    execution slots (capped at max_sfn_executions concurrent RUNNING
    executions) and starts one execution per received key.

    Arguments:
        event {dict} -- Dictionary with no relevant details
        context {dict} -- Dictionary with details on Lambda context
    """
    # TODO Implement Redrive Logic (through message_group_id)
    # Fix: bind keys_to_process before any statement that can raise, so the
    # except handler never hits a NameError that masks the original error.
    keys_to_process = []
    try:
        team = event['team']
        pipeline = event['pipeline']
        stage = event['pipeline_stage']
        dataset = event['dataset']
        org = event['org']
        app = event['app']
        env = event['env']
        stage_bucket = S3Configuration().stage_bucket
        sqs_config = SQSConfiguration(team, dataset, stage)
        # Workload management changes
        # ---------------------------
        max_sfn_executions = 3  # hard cap on concurrent executions

        state_config = StateMachineConfiguration(team, pipeline, stage)
        STEPFUNCTION_ARN = state_config.get_stage_state_machine_arn

        # SFN processing: count RUNNING executions to compute free slots.
        executions = StatesInterface().list_state_executions(
            STEPFUNCTION_ARN, 'RUNNING', 50)
        current_sfn_exeuction = len(executions)
        logger.info(f"current_sfn_exeuctions:{current_sfn_exeuction}")
        if (current_sfn_exeuction < max_sfn_executions):
            sfn_available_slots = max_sfn_executions - current_sfn_exeuction
            logger.info(f"sfn_available_slots:{sfn_available_slots}")
        else:
            logger.info("No step function slot empty ----- exiting")
            return

        # Dynamically managing workload from different priority queues:
        # drain HIGH first, then LOW, never exceeding the free slots.
        for priority in ["HIGH", "LOW"]:
            logger.debug(priority)  # was a bare print()
            sqs_config = SQSConfiguration(team, dataset, stage, priority)
            queue_interface = SQSInterface(sqs_config.get_stage_queue_name_wlm)
            message_queue = queue_interface._message_queue
            num_messages_queue = int(
                message_queue.attributes['ApproximateNumberOfMessages'])
            logger.info(
                f"Number of messages in {message_queue}:{num_messages_queue}")
            if (num_messages_queue == 0):
                logger.info(
                    f"Not enough messages in {message_queue}, trying next prority queue"
                )
            else:
                # Clamp the receive count to the remaining slots.
                if (num_messages_queue >= sfn_available_slots):
                    num_messages_queue = sfn_available_slots
                keys_to_process = queue_interface.wlm_receive_max_messages(
                    keys_to_process, num_messages_queue)
                logger.info(f"messages:{keys_to_process}")
                sfn_available_slots = sfn_available_slots - num_messages_queue
                logger.info(f"sfn_available_slots:{sfn_available_slots}")
            if (sfn_available_slots == 0):
                break

        # Start one state machine execution per received key.
        if (len(keys_to_process) > 0):
            for key in keys_to_process:
                response = {
                    'statusCode': 200,
                    'body': {
                        "bucket": stage_bucket,
                        "keysToProcess": [key],
                        "team": team,
                        "pipeline": pipeline,
                        "pipeline_stage": stage,
                        "dataset": dataset,
                        "org": org,
                        "app": app,
                        "env": env
                    }
                }
                StatesInterface().run_state_machine(
                    state_config.get_stage_state_machine_arn, response)
            logger.info(
                f"{len(keys_to_process)} messages sent to step function ")
        else:
            logger.info(f"Not enough messages in any queue --- exiting")
    except Exception as e:
        # If failure send to DLQ (uses the last sqs_config built above)
        if keys_to_process:
            for key in keys_to_process:
                dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name)
                dlq_interface.send_message_to_fifo_queue(
                    json.dumps(key), 'failed')
        logger.error("Fatal error", exc_info=True)
        raise e
    return
def lambda_handler(event, context):
    """Updates the S3 objects metadata catalog

    Records each processed key in the DynamoDB object-metadata catalog,
    resolves the dataset's WLM priority, and enqueues the processed keys on
    the NEXT stage's priority FIFO queue (stage letter advanced by one).

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with outcome of the process
    """
    # Fix: bind octagon_client before the try so the except handler cannot
    # raise NameError (masking the original exception) when the failure
    # happens before the client is built.
    octagon_client = None
    try:
        logger.info('Fetching event data from previous step')
        processed_keys = event['body']['processedKeys']
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        peh_id = event['body']['peh_id']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        logger.info('Storing metadata to DynamoDB')
        bucket = S3Configuration().stage_bucket
        for key in processed_keys:
            object_metadata = {
                'bucket': bucket,
                'key': key,
                'size': S3Interface().get_size(bucket, key),
                'last_modified_date':
                    S3Interface().get_last_modified(bucket, key),
                'org': event['body']['org'],
                'app': event['body']['app'],
                'env': event['body']['env'],
                'team': team,
                'pipeline': pipeline,
                'dataset': dataset,
                'stage': 'stage',
                'pipeline_stage': stage,
                'peh_id': peh_id
            }
            dynamo_interface.update_object_metadata_catalog(object_metadata)

        # Workload management changes
        # ---------------------------
        wlm_ddb_table = dynamo_interface.wlm_control_table
        # Key assumes processed keys look like .../<table>/<file> — TODO confirm.
        item = dynamo_interface.get_item(
            wlm_ddb_table, {
                "name":
                "{}-{}-{}".format(team, dataset,
                                  processed_keys[0].split("/")[-2])
            })
        priority = item.get('priority', None)
        logger.debug('priority=%s', priority)  # was a bare print()
        # ---------------------------

        logger.info('Sending messages to next SQS queue if it exists')
        # chr(ord(stage[-1]) + 1) advances the stage letter (e.g. A -> B).
        sqs_config = SQSConfiguration(team, dataset, ''.join(
            [stage[:-1], chr(ord(stage[-1]) + 1)]),
            priority)  # Workload management changes
        sqs_interface = SQSInterface(
            sqs_config.get_stage_queue_name_wlm)  # Workload management changes
        sqs_interface.send_batch_messages_to_fifo_queue(
            processed_keys, 10, '{}-{}'.format(team, dataset))

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
        octagon_client.end_pipeline_execution_success()
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # Only report the failure if the client exists; if it does, stage
        # and component were necessarily bound beforehand.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(stage, component,
                                                       repr(e)))
        raise e
    return 200
def lambda_handler(event, context):
    """Updates the S3 objects metadata catalog

    Catalogs and tags every processed object, then consults the
    dependencies DynamoDB tables (via SSM parameters) to find datasets
    triggered by the just-written table, and enqueues a stage-B message
    for each triggered dataset.

    Arguments:
        event {dict} -- Dictionary with details on Bucket and Keys
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with response
    """
    def replace_decimals(obj):
        # Recursively convert DynamoDB Decimal values into int/float so the
        # payload is JSON-serializable.
        if isinstance(obj, list):
            for i in range(len(obj)):
                obj[i] = replace_decimals(obj[i])
            return obj
        elif isinstance(obj, dict):
            for k, v in obj.items():
                obj[k] = replace_decimals(v)
            return obj
        elif isinstance(obj, set):
            return set(replace_decimals(i) for i in obj)
        elif isinstance(obj, decimal.Decimal):
            if obj % 1 == 0:
                return int(obj)
            else:
                return float(obj)
        else:
            return obj

    def get_table_partitions(db, tbl):
        # Fetch the Glue partition keys for db.tbl (currently only used by
        # commented-out code below).
        glue_response = glue_client.get_table(DatabaseName=db, Name=tbl)
        logger.debug('Glue get_table response: {}'.format(glue_response))
        return glue_response['Table']['PartitionKeys']

    # Fix: bind octagon_client before the try so the except handler cannot
    # raise NameError when the failure happens before the client is built.
    octagon_client = None
    try:
        logger.info('Fetching event data from previous step')
        bucket = event['body']['bucket']
        processed_keys_path = event['body']['job']['processedKeysPath']
        processed_keys = S3Interface().list_objects(bucket,
                                                    processed_keys_path)
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset1 = event['body']['dataset']
        peh_id = event['body']['job']['peh_id']
        env = event['body']['env']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        logger.info(
            'Storing metadata to DynamoDB and tagging resulting S3 Objects')
        for key in processed_keys:
            object_metadata = {
                'bucket': bucket,
                'key': key,
                'size': S3Interface().get_size(bucket, key),
                'last_modified_date':
                    S3Interface().get_last_modified(bucket, key),
                'org': event['body']['org'],
                'app': event['body']['app'],
                'env': event['body']['env'],
                'team': team,
                'pipeline': pipeline,
                'dataset': dataset1,
                'stage': 'stage',
                'pipeline_stage': stage,
                'peh_id': peh_id
            }
            dynamo_interface.update_object_metadata_catalog(object_metadata)
            tag_keys = ['org', 'app', 'env', 'team', 'dataset']
            tag_dict = {key: object_metadata[key] for key in tag_keys}
            S3Interface().tag_object(bucket, key, tag_dict)

        # Only uncomment if a queue for the next stage exists
        # logger.info('Sending messages to next SQS queue if it exists')
        # sqs_config = SQSConfiguration(team, dataset, ''.join([stage[:-1], chr(ord(stage[-1]) + 1)]))
        # sqs_interface = SQSInterface(sqs_config.get_stage_queue_name)
        # sqs_interface.send_batch_messages_to_fifo_queue(processed_keys, 10, '{}-{}'.format(team, dataset))

        prestage_table = event['body']['dest_table']['name']
        prestage_db = event['body']['dest_db']
        dest_part_name = event['body']['dest_table']['part_name']
        dest_part_value = event['body']['dest_table']['part_value']
        processOutput = {}
        # Fix: was `is not ''` (identity comparison against a literal —
        # SyntaxWarning and interning-dependent); use equality.
        if dest_part_name != '' and dest_part_value != '':
            partitions = []
            part_dict = {"name": dest_part_name, "value": dest_part_value}
            partitions.append(part_dict)
            processOutput['partitions'] = partitions
        processOutput['processed_keys'] = processed_keys

        ssmresponse = ssmcli.get_parameter(
            Name=f'/SDLF/DDB/{team}/{pipeline}/DependenciesByTable')
        ddb_dependencies_by_table = ssmresponse['Parameter']['Value']
        ddb_table = dynamodb.Table(ddb_dependencies_by_table)
        ssmresponse = ssmcli.get_parameter(
            Name=f'/SDLF/DDB/{team}/{pipeline}/Dependencies')
        ddb_dependencies = ssmresponse['Parameter']['Value']
        consulta = f'{prestage_db.lower()}.{prestage_table.lower()}'
        logger.info(consulta)
        response = ddb_table.get_item(Key={'table_name': consulta})
        logger.info(f'Response {response}')
        if 'Item' in response:
            list_transforms = response['Item']['list_transforms']
            num_of_transforms = len(list_transforms)
            logger.debug(f'Response {response}')
            logger.info(f'This table triggers {num_of_transforms} datasets')
            next_stage = 'B'
            stage_b_message = {}
            for dataset in list_transforms:
                ddb_steps = dynamodb.Table(ddb_dependencies)
                logger.info(dataset)
                response = ddb_steps.get_item(Key={'dataset': dataset})
                logger.info(f'Response {response}')
                num_of_transforms = len(list_transforms)
                item = response['Item']
                dest_table = item['dataset'].split('.')[1]
                dest_db = item['dataset'].split('.')[0]
                dependencies = item['dependencies']
                date_substitutions = replace_decimals(
                    item.get('date_substitutions', []))
                logger.info(f'Dependencies: {dependencies}')
                partition = item.get('partitionColumn', '')
                partition_mask = item.get('partitionPythonMask', None)
                partition_value_formatted = None
                for table in dependencies:
                    table_name = table['TableName'].split('.')[1]
                    table_db = table['TableName'].split('.')[0]
                    table_partition = table.get('FieldColumn', '')
                    table_partition_format = table.get('DateExpression', None)
                    relativedelta_attributes = replace_decimals(
                        table.get('relativedelta_attributes', None))
                    table_partitions = processOutput.get('partitions', [])
                    usage = table.get('Usage', 'validate').lower()
                    if usage == 'validate':
                        if prestage_db == table_db and prestage_table == table_name:
                            logger.info(
                                f'This table does not update/overwrite {dataset} dataset'
                            )
                            break
                        else:
                            logger.debug(
                                f'Table {table_db}.{table_name} is not the trigger table'
                            )
                    else:
                        if prestage_db.lower() == table_db.lower(
                        ) and prestage_table.lower() == table_name.lower():
                            # dst_tbl_partitions = get_table_partitions(prestage_db,prestage_table)
                            partition_value_formatted = ''
                            # If dest table has partitions and source table has partitions
                            logger.debug(
                                f'Partition: {partition}, table_partitions: {table_partitions}'
                            )
                            if table_partitions and table_partition_format is not None:
                                table_partition_value = table_partitions[0][
                                    'value']
                                value = datetime.strptime(
                                    table_partition_value,
                                    table_partition_format)
                                target_value = value - relativedelta(
                                    **relativedelta_attributes)
                                partition_value_formatted = target_value.strftime(
                                    partition_mask)
                                logger.info(
                                    f'This table {usage.upper()} dataset {dest_table} '
                                    f' Partition {partition} = {partition_value_formatted}'
                                )
                            # validate(table_db, table_name, table_partitions)
                            stage_b_message[
                                'prev_stage_processed_keys'] = processed_keys
                            stage_b_message['team'] = team
                            stage_b_message['pipeline'] = pipeline
                            stage_b_message['pipeline_stage'] = ''.join(
                                [stage[:-1], next_stage])
                            stage_b_message['dataset'] = dataset1
                            stage_b_message['org'] = event['body']['org']
                            stage_b_message['app'] = event['body']['app']
                            stage_b_message['env'] = event['body']['env']
                            stage_b_message['behaviour'] = table[
                                'Usage'].lower()
                            stage_b_message['dest_db'] = dest_db
                            stage_b_message['dest_table'] = {}
                            stage_b_message['dest_table']['name'] = dest_table
                            stage_b_message['dest_table'][
                                'part_name'] = partition
                            stage_b_message['dest_table'][
                                'part_value'] = partition_value_formatted
                            stage_b_message['steps'] = item['steps']
                            stage_b_message[
                                'date_substitutions'] = date_substitutions
                            logger.info(
                                'Sending messages to next SQS queue if it exists'
                            )
                            # GEt queue by SSM
                            logger.info(stage_b_message)
                            sqs_config = SQSConfiguration(
                                team, pipeline, stage)
                            sqs_interface = SQSInterface(
                                sqs_config.get_stage_queue_name)
                            sqs_interface.send_message_to_fifo_queue(
                                json.dumps(stage_b_message),
                                '{}-{}'.format(team, pipeline))
                            break
        else:
            logger.info(f'This table triggers 0 datasets')

        octagon_client.update_pipeline_execution(
            status=f'{stage} {component} Processing', component=component)
        octagon_client.end_pipeline_execution_success()
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # Only report the failure if the client exists; if it does, stage
        # and component were necessarily bound beforehand.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment=f'{stage} {component} Error: {repr(e)}')
        raise e
    return 200
def lambda_handler(event, context):
    """Checks if any items need processing and triggers state machine

    Reads the per-stage min/max batch size from the transform table, pulls
    that many keys from the stage queue, and starts one state machine
    execution for the whole (de-duplicated) batch.

    Arguments:
        event {dict} -- Dictionary with no relevant details
        context {dict} -- Dictionary with details on Lambda context
    """
    # TODO Implement Redrive Logic (through message_group_id)
    # Fix: bind names used by the except handler before anything can raise,
    # so an early failure cannot turn into a NameError that masks the
    # original exception.
    keys_to_process = []
    response = None
    try:
        team = event['team']
        pipeline = event['pipeline']
        stage = event['pipeline_stage']
        dataset = event['dataset']
        org = event['org']
        app = event['app']
        env = event['env']
        stage_bucket = S3Configuration().stage_bucket
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)
        transform_info = dynamo_interface.get_transform_table_item(
            '{}-{}'.format(team, dataset))
        # Batch bounds are keyed by the stage letter, e.g. 'stage_a'.
        MIN_ITEMS_TO_PROCESS = int(
            transform_info['min_items_process']['stage_{}'.format(
                stage[-1].lower())])
        MAX_ITEMS_TO_PROCESS = int(
            transform_info['max_items_process']['stage_{}'.format(
                stage[-1].lower())])
        sqs_config = SQSConfiguration(team, dataset, stage)
        queue_interface = SQSInterface(sqs_config.get_stage_queue_name)
        logger.info('Querying {}-{} objects waiting for processing'.format(
            team, dataset))
        keys_to_process = queue_interface.receive_min_max_messages(
            MIN_ITEMS_TO_PROCESS, MAX_ITEMS_TO_PROCESS)
        # If no keys to process, break
        if not keys_to_process:
            return
        logger.info('{} Objects ready for processing'.format(
            len(keys_to_process)))
        keys_to_process = list(set(keys_to_process))
        response = {
            'statusCode': 200,
            'body': {
                "bucket": stage_bucket,
                "keysToProcess": keys_to_process,
                "team": team,
                "pipeline": pipeline,
                "pipeline_stage": stage,
                "dataset": dataset,
                "org": org,
                "app": app,
                "env": env
            }
        }
        logger.info('Starting State Machine Execution')
        state_config = StateMachineConfiguration(team, pipeline, stage)
        StatesInterface().run_state_machine(
            state_config.get_stage_state_machine_arn, response)
    except Exception as e:
        # If failure send to DLQ
        if keys_to_process:
            dlq_interface = SQSInterface(sqs_config.get_stage_dlq_name)
            dlq_interface.send_message_to_fifo_queue(json.dumps(response),
                                                     'failed')
        logger.error("Fatal error", exc_info=True)
        raise e
    return