def delete_glue_database(database_name):
    """Delete the named Glue database, reporting the outcome via ``ul.log``.

    Any exception raised by the Glue client is caught and logged rather
    than propagated (best-effort deletion).
    """
    try:
        glue_client.delete_database(Name=database_name)
        message = f'Glue database {database_name} has been successfully deleted'
        ul.log(info=message)
    except Exception as exc:
        ul.log(error=exc)
def create_glue_database(database_definition):
    """Create a Glue database from a ``DatabaseInput`` definition dict.

    The definition must contain at least a ``Name`` key. Failures from
    the Glue client are logged, not raised (best-effort creation).
    """
    try:
        glue_client.create_database(DatabaseInput=database_definition)
        message = (
            f'Glue database {database_definition["Name"]} '
            'has been successfully created'
        )
        ul.log(info=message)
    except Exception as exc:
        ul.log(error=exc)
def create_glue_table(database_name, table_definition):
    """Create a Glue table in *database_name* from a ``TableInput`` dict.

    The definition must contain at least a ``Name`` key. Failures from
    the Glue client are logged, not raised (best-effort creation).
    """
    try:
        glue_client.create_table(DatabaseName=database_name,
                                 TableInput=table_definition)
        message = (
            f'Glue table {table_definition["Name"]} '
            'has been successfully created'
        )
        ul.log(info=message)
    except Exception as exc:
        ul.log(error=exc)
def create_error_vault_database_table_from_vault_database_table(
        vault_bucket, vault_database_name, vault_table_name):
    """Create the error-vault counterpart of an existing vault table.

    Copies the vault table's Glue definition, renames it, repoints its
    storage location at the error-table S3 prefix, and appends the
    standard error-metadata columns before creating it inside the
    error-vault database (which is created first if it does not exist).

    Args:
        vault_bucket: S3 bucket holding the vault data.
        vault_database_name: Glue database the source table lives in.
        vault_table_name: Name of the source vault table.

    Any exception is caught and logged; nothing is raised to the caller.
    """
    error_vault_database_name = get_error_vault_database_name(
        vault_database_name)
    error_vault_table_name = get_error_table_name(vault_table_name)
    try:
        # Ensure the target database exists before creating the table in it.
        if get_glue_database_definition(error_vault_database_name) is None:
            create_error_vault_database_from_vault_database(
                vault_database_name)
        response = get_glue_table_definition(vault_database_name,
                                             vault_table_name)
        response['Table']['Name'] = error_vault_table_name
        response['Table']['StorageDescriptor'][
            'Location'] = get_error_table_s3_uri(vault_bucket,
                                                 vault_table_name)
        # Extra columns describing each rejected row.
        columns = [{
            'Name': 'error_load_ts',
            'Type': 'timestamp',
            'Comment': 'loading timestamp of error rows'
        }, {
            'Name': 'error_column',
            'Type': 'string',
            'Comment': 'column name which has error value '
        }, {
            'Name': 'error_value',
            'Type': 'string',
            'Comment': 'error value '
        }, {
            'Name': 'error_description',
            'Type': 'string',
            'Comment': 'error description for rejected rows'
        }, {
            'Name': 'stage_path',
            'Type': 'string',
            'Comment': 'data source'
        }]
        response['Table']['StorageDescriptor']['Columns'].extend(columns)
        # Strip read-only attributes that create_table does not accept.
        keys_to_remove = [
            'DatabaseName', 'CreateTime', 'UpdateTime', 'CreatedBy',
            'IsRegisteredWithLakeFormation', 'CatalogId'
        ]
        for key in keys_to_remove:
            # pop() tolerates keys absent from the GetTable response,
            # where `del` would raise KeyError and abort the whole create.
            response['Table'].pop(key, None)
        create_glue_table(error_vault_database_name, response['Table'])
    except Exception as exc:
        ul.log(error=exc)
def get_s3_event_bucket_and_key(event):
    """Return ``(bucket, key)`` for the first S3 record in *event*.

    Handles both direct S3 notifications and S3 notifications delivered
    through SNS, where the real S3 event is JSON-encoded in the SNS
    ``Message`` field.

    Args:
        event: Lambda event dict containing a ``Records`` list.

    Returns:
        Tuple of (bucket name, object key) from the first S3 record.
    """
    ul.log(event=event)
    # SNS-wrapped records carry 'EventSource'; direct S3 records use the
    # lower-camel 'eventSource' key, so .get() avoids a KeyError on the
    # direct-notification path (plain ['EventSource'] would crash there).
    if event['Records'][0].get('EventSource') == 'aws:sns':
        event = json.loads(event['Records'][0]['Sns']['Message'])
        ul.log(event=event)
    result = jmespath.search(
        'Records[*].s3.{ Bucket : bucket.name, Key : object.key }|[0]',
        event)
    return (result['Bucket'], result['Key'])
def send_msg_list_to_sqs(queue_url, msg_list):
    """Send every message in *msg_list* to an SQS queue.

    The list is split into batches sized to fit the SQS payload limit,
    based on the list's total JSON-serialized size.
    """
    assert isinstance(msg_list, Iterable)
    items_per_batch = calc_avg_items_per_batch(
        item_count=len(msg_list),
        batch_size=get_batch_bytes(),
        total_size=size_in_json(msg_list))
    ul.log(avg_items_per_batch=items_per_batch, queue_url=queue_url)
    batches = generate_json_item_batches(msg_list, items_per_batch,
                                         get_batch_bytes())
    for batch in batches:
        send_msg_obj_to_sqs(queue_url, batch)
def create_error_vault_database_from_vault_database(vault_database_name):
    """Create the error-vault database mirroring *vault_database_name*.

    Copies the source database's Glue definition, renames it to the
    error-vault name, and re-describes it before creating it. If the
    error-vault database already exists, an error is logged and nothing
    is created.

    Args:
        vault_database_name: Name of the source Glue vault database.

    Any exception during creation is caught and logged, not raised.
    """
    error_vault_database_name = get_error_vault_database_name(
        vault_database_name)
    if get_glue_database_definition(error_vault_database_name) is None:
        try:
            response = get_glue_database_definition(vault_database_name)
            response['Database']['Name'] = error_vault_database_name
            response['Database']['Description'] = 'Data Lake Errors'
            # Strip read-only attributes that create_database does not
            # accept; pop() tolerates keys absent from the GetDatabase
            # response, where `del` would raise KeyError and abort.
            for key in ('CreateTime', 'CatalogId'):
                response['Database'].pop(key, None)
            create_glue_database(response['Database'])
        except Exception as exc:
            ul.log(error=exc)
    else:
        ul.log(error=f'{error_vault_database_name} already exists')
def send_batches_to_sqs(queue_url, batches):
    """Send each pre-sized batch in *batches* to an SQS queue.

    The batches must already fit within SQS message-size limits. A
    non-None response from the sender is treated as a failure and is
    logged together with the offending batch.
    """
    ul.log(status='starting send batches', queue_url=queue_url)
    for batch_id, batch in enumerate(batches):
        ul.log(batch=batch_id)
        response = send_msg_obj_to_sqs(
            queue_url=queue_url,
            msg_obj=batch,
        )
        if response is None:
            continue
        ul.log(
            Error='Failed to send batch to queue',
            batch_id=batch_id,
            httpstatuscode=response["ResponseMetadata"]["HTTPStatusCode"],
            queue_url=queue_url,
            response=response,
            message=batch)
def delete_glue_table(database_name, table_name):
    """Delete *table_name* from the given Glue database.

    Any exception raised by the Glue client is caught and logged rather
    than propagated (best-effort deletion).
    """
    try:
        glue_client.delete_table(DatabaseName=database_name,
                                 Name=table_name)
        message = f'Glue table {table_name} has been successfully deleted'
        ul.log(info=message)
    except Exception as exc:
        ul.log(error=exc)