def _get_manifests_to_process(self, event):
    try:
        batch_id = event.get("BatchId")
        # Seed every manifest URL key so downstream steps can rely on the keys existing.
        data = dict()
        data["jamurl"] = ""
        data["alerturl"] = ""
        data["irregularityurl"] = ""
        data["irregularity_alerturl"] = ""
        data["irregularity_jamurl"] = ""
        data["irregularity_point_sequenceurl"] = ""
        data["jam_point_sequenceurl"] = ""
        data["queueUrl"] = event.get("queueUrl")
        data["receiptHandle"] = event.get("receiptHandle")
        if batch_id is None:
            return data
        dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
        table = dynamodb.Table(os.environ["CURATION_MANIFEST_TABLE"])
        response = table.query(
            IndexName="dev-BatchId-TableName-index",
            KeyConditionExpression=Key('BatchId').eq(batch_id),
            FilterExpression=Attr('FileStatus').eq('open'))
        data["batchId"] = batch_id
        # Map each open manifest onto its "<TableName>url" entry.
        for item in response['Items']:
            url = item["TableName"] + "url"
            data[url] = item["ManifestS3Key"]
        return data
    except Exception as e:
        LoggerUtility.log_error("Error getting manifests for batches")
        raise e
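# Illustrative sketch (hypothetical values, not from the source): the event and
# DynamoDB item shapes _get_manifests_to_process works with. The field names
# come from the code above; the concrete values are made up.
#
#   event = {
#       "BatchId": "1718000000",
#       "queueUrl": "https://sqs.us-east-1.amazonaws.com/123456789012/example-curated-batches-queue",
#       "receiptHandle": "AQEB...example..."
#   }
#
# A matching open manifest item in the curation manifest table might look like:
#   {"BatchId": "1718000000", "TableName": "alert", "FileStatus": "open",
#    "ManifestS3Key": "manifests/alert/1718000000.manifest"}
# which would set data["alerturl"] = "manifests/alert/1718000000.manifest"
# alongside the pre-seeded empty "...url" keys and data["batchId"].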
def handle_bucket_event(self, event):
    LoggerUtility.set_level()
    bucket_name, object_key = self.fetch_s3_details_from_event(event)
    s3_head_object = self.get_s3_head_object(bucket_name, object_key)
    metadata = self.create_metadata_object(s3_head_object, object_key)
    self.push_metadata_to_elasticsearch(bucket_name, metadata)
    self.publish_custom_metrics_to_cloudwatch(bucket_name, metadata)
def persist_curated_datasets(event, batch_id):
    LoggerUtility.set_level()
    table_name = event["tablename"]
    manifest_url_parameter = table_name + "url"
    manifest_url = event[manifest_url_parameter]
    is_historical = event["is_historical"] == 'true'
    # SQL templates are named <FUNCTION_LOGIC>_<table_name>.sql.
    sql_file_name = FUNCTION_LOGIC + "_" + table_name + ".sql"
    __persist_records_to_redshift(manifest_url, table_name, sql_file_name, batch_id, is_historical)
def get_latest_batch(self, latest_batch_id):
    try:
        ssm = boto3.client('ssm', region_name='us-east-1')
        response = ssm.get_parameter(Name=latest_batch_id, WithDecryption=False)
        LoggerUtility.log_info("Response from parameter store - {}".format(response))
        current_batch_id = response["Parameter"]["Value"]
    except Exception as ex:
        LoggerUtility.log_error("Unable to get latest batch with reason - {}".format(ex))
        raise ex
    return current_batch_id
def push_batch_id_to_queue(self, current_batch_id):
    try:
        sqs = boto3.resource('sqs', region_name='us-east-1')
        # The queue name is the last segment of the queue ARN.
        queue_name = os.environ["SQS_CURATED_BATCHES_QUEUE_ARN"].rsplit(':', 1)[1]
        curated_batches_queue = sqs.get_queue_by_name(QueueName=queue_name)
        curated_batches_queue.send_message(
            MessageBody=json.dumps({'BatchId': current_batch_id}),
            MessageGroupId="WazeCuratedBatchesMessageGroup"
        )
        LoggerUtility.log_info("Successfully pushed the message to queue for batchid - {}".format(current_batch_id))
    except Exception as ex:
        LoggerUtility.log_error("Failed to push the batch to queue - {}".format(ex))
        raise ex
def __init__(self, user, password, redshift_jdbc_url):
    # Parse the endpoint, port and database name out of a JDBC URL of the form
    # <scheme>://<endpoint>:<port>/<dbname>.
    endpoint_and_rest = redshift_jdbc_url.split('://')[1].split(':')
    endpoint = endpoint_and_rest[0]
    port_and_dbname = endpoint_and_rest[1].split('/')
    port = port_and_dbname[0]
    dbname = port_and_dbname[1]
    self.connection = psycopg2.connect(database=dbname, port=port, host=endpoint,
                                       password=password, user=user)
    self.connection.set_session(autocommit=True)
    self.cursor = self.connection.cursor()
    LoggerUtility.log_info("Established connection successfully")
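# Worked example (hypothetical cluster endpoint and database name): how the URL
# parsing above splits a Redshift JDBC URL.
#
#   redshift_jdbc_url = "jdbc:redshift://example-cluster.abc123.us-east-1.redshift.amazonaws.com:5439/warehouse"
#   endpoint -> "example-cluster.abc123.us-east-1.redshift.amazonaws.com"
#   port     -> "5439"
#   dbname   -> "warehouse"
#
# The port is passed to psycopg2 as a string, which psycopg2 accepts.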
def create_metadata_object(self, s3_head_object, key):
    metadata = {
        Constants.KEY_REFERENCE: key,
        Constants.CONTENT_LENGTH_REFERENCE: s3_head_object[Constants.CONTENT_LENGTH_REFERENCE],
        Constants.SIZE_MIB_REFERENCE: s3_head_object[Constants.CONTENT_LENGTH_REFERENCE] / 1024**2,
        Constants.LAST_MODIFIED_REFERENCE: s3_head_object[Constants.LAST_MODIFIED_REFERENCE].isoformat(),
        Constants.CONTENT_TYPE_REFERENCE: s3_head_object[Constants.CONTENT_TYPE_REFERENCE],
        Constants.ETAG_REFERENCE: s3_head_object[Constants.ETAG_REFERENCE],
        Constants.DATASET_REFERENCE: key.split('/')[0],
        Constants.ENVIRONMENT_NAME: os.environ["ENVIRONMENT_NAME"]
    }
    if key.split('/')[0] == "waze":
        # Waze keys encode traffic type, table and state as path segments of the
        # form type=<value>, table=<value> and state=<value>.
        if 'type' in key:
            type_value = key.split('/type=')[1].split('/')[0]
            metadata.update({Constants.TRAFFIC_TYPE_REFERENCE: type_value})
        if 'table' in key:
            table_value = key.split('/table=')[1].split('/')[0]
            metadata.update({Constants.TABLE_NAME_REFERENCE: table_value})
        if 'state' in key:
            state_value = key.split('/state=')[1].split('/')[0]
            metadata.update({Constants.STATE_REFERENCE: state_value})
    elif key.split('/')[0] == "cv":
        # CV keys carry the data provider and data type as the second and third path segments.
        data_provider_type_value = key.split('/')[1]
        metadata.update({Constants.DATA_PROVIDER_REFERENCE: data_provider_type_value})
        data_type_value = key.split('/')[2]
        metadata.update({Constants.DATA_TYPE_REFERENCE: data_type_value})
    LoggerUtility.log_info("METADATA: " + str(metadata))
    return metadata
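# Illustrative sketch (hypothetical keys, not from the source): how the parsing
# above maps object keys onto metadata fields. The Constants.*_REFERENCE names
# resolve to the actual dictionary keys; the labels shown here mirror the names
# read by publish_custom_metrics_to_cloudwatch.
#
#   key = "waze/type=alert/table=alerts/state=VA/2020/01/01/part-0000.json"
#     -> Dataset = "waze", TrafficType = "alert", TableName = "alerts", State = "VA"
#
#   key = "cv/example-provider/BSM/2020/01/01/part-0000.json"
#     -> Dataset = "cv", DataProvider = "example-provider", DataType = "BSM"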
def create_new_batch_id(self, latest_batch_id):
    new_batch_id = str(int(time.time()))
    try:
        ssm = boto3.client('ssm', region_name='us-east-1')
        ssm.put_parameter(
            Name=latest_batch_id,
            Description='Parameter to hold the latest value of a batch used for processing waze transactions',
            Value=new_batch_id,
            Type='String',
            Overwrite=True,
            AllowedPattern='\\d+')
        LoggerUtility.log_info("Successfully created a new batch with id - {}".format(new_batch_id))
    except Exception as ex:
        LoggerUtility.log_error("Failed to create new batch with reason - {}".format(ex))
        raise ex
    return new_batch_id
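# Example (hypothetical timestamp): batch ids are the current epoch time in
# whole seconds, so they always satisfy the SSM parameter's AllowedPattern of '\d+'.
#
#   int(time.time())      -> 1718000000
#   str(int(time.time())) -> "1718000000"   # stored as the parameter value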
def register_kibana_dashboard(self):
    LoggerUtility.set_level()
    try:
        es_endpoint = os.environ[Constants.ES_ENDPOINT_ENV_VAR]
    except KeyError as e:
        LoggerUtility.log_error(str(e) + " not configured")
        LoggerUtility.log_error("Failed to register kibana dashboard")
        raise e
    es_client = ElasticsearchClient.get_client(es_endpoint)
    try:
        self._create_metadata_visualizations(es_client)
    except ElasticsearchException as e:
        LoggerUtility.log_error(e)
        LoggerUtility.log_error("Failed to register kibana dashboard")
        raise e
def execute_from_file(self, file_name, **query_kwargs):
    LoggerUtility.log_info("Filename - {}, Role - {}".format(file_name, self.redshift_role_arn))
    query = self.query_loader.load_from_file(
        file_name,
        region_name=self.region_name,
        redshift_role_arn=self.redshift_role_arn,
        **query_kwargs)
    LoggerUtility.log_info("Query details - {}".format(query))
    LoggerUtility.log_info("Executing redshift copy command")
    self.redshift_connection.execute(query)
    LoggerUtility.log_info("Completed redshift copy command")
def get_s3_head_object(self, bucket_name, object_key):
    """
    Fetch the head-object metadata for an S3 object.

    :param bucket_name: name of the S3 bucket
    :param object_key: key of the object within the bucket
    :return: the head_object response dictionary
    """
    s3_client = boto3.client('s3', region_name='us-east-1')
    try:
        response = s3_client.head_object(Bucket=bucket_name, Key=object_key)
    except ClientError as e:
        LoggerUtility.log_error(e)
        LoggerUtility.log_error(
            'Error getting object {} from bucket {}. Make sure they exist, '
            'your bucket is in the same region as this function and necessary permissions '
            'have been granted.'.format(object_key, bucket_name))
        raise e
    else:
        return response
def fetch_s3_details_from_event(self, event):
    """
    Pull the bucket name and object key from an SNS-wrapped S3 event.

    :param event: Lambda event whose SNS Message field carries an S3 notification as JSON
    :return: bucket, key
    """
    try:
        sns_message = json.loads(event["Records"][0]["Sns"]["Message"])
        bucket = sns_message["Records"][0]["s3"]["bucket"]["name"]
        key = urllib.parse.unquote_plus(sns_message["Records"][0]["s3"]["object"]["key"])
    except Exception as e:
        LoggerUtility.log_error(str(e))
        LoggerUtility.log_error("Failed to process the event")
        raise e
    else:
        LoggerUtility.log_info("Bucket name: " + bucket)
        LoggerUtility.log_info("Object key: " + key)
        return bucket, key
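# Illustrative sketch (hypothetical bucket and key): the event shape this
# function expects. The S3 notification arrives through SNS, so the inner S3
# record is a JSON string in the Sns.Message field, and the object key may be
# URL-encoded, hence the unquote_plus call.
#
#   event = {
#       "Records": [{
#           "Sns": {
#               "Message": "{\"Records\": [{\"s3\": {"
#                          "\"bucket\": {\"name\": \"example-submissions-bucket\"}, "
#                          "\"object\": {\"key\": \"waze/type%3Dalert/state%3DVA/part-0000.json\"}}}]}"
#           }
#       }]
#   }
#
# fetch_s3_details_from_event(event) would return
#   ("example-submissions-bucket", "waze/type=alert/state=VA/part-0000.json")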
def create_batch(self):
    LoggerUtility.set_level()
    LoggerUtility.log_info("Initiating batch creation process")
    latest_batch_id = os.environ["LATEST_BATCH_ID"]
    current_batch_id = self.get_latest_batch(latest_batch_id)
    if current_batch_id == "":
        # No batch recorded yet - just create the first one.
        new_batch_id = self.create_new_batch_id(latest_batch_id)
    else:
        # Hand the current batch off to the curated-batches queue, then start a new one.
        self.push_batch_id_to_queue(current_batch_id)
        new_batch_id = self.create_new_batch_id(latest_batch_id)
    LoggerUtility.log_info("Completed batch creation process with batch id - {}".format(new_batch_id))
def __persist_records_to_redshift(manifest_s3key_name, table_name, sql_file_name, batch_id, is_historical):
    """
    Load the curated records referenced by an S3 manifest into Redshift.

    :param manifest_s3key_name: S3 key of the curated manifest file
    :param table_name: name of the table being persisted
    :param sql_file_name: name of the SQL template to run
    :param batch_id: id of the batch being persisted
    :param is_historical: whether to load into the historical schemas
    :return:
    """
    try:
        s3_resource = boto3.resource('s3')
        LoggerUtility.log_info("Started persistence for table_name - {}".format(table_name))
        curated_bucket = os.environ['CURATED_BUCKET_NAME']
        LoggerUtility.log_info("Manifest s3 key = {}".format(manifest_s3key_name))
        redshift_manager = __make_redshift_manager()
        # Download the SQL template from S3 to REDSHIFT_SQL_DIR under a unique name.
        query_file_temp_name = str(uuid.uuid4()) + sql_file_name
        s3_resource.Bucket(CONFIG_BUCKET).download_file(SQL_KEY_PREFIX + "/" + sql_file_name,
                                                        REDSHIFT_SQL_DIR + "/" + query_file_temp_name)
        LoggerUtility.log_info("Downloaded file from S3 - {}".format(query_file_temp_name))
        dw_schema_name = "dw_waze"
        elt_schema_name = "elt_waze"
        if is_historical:
            dw_schema_name = "dw_waze_history"
            elt_schema_name = "elt_waze_history"
        redshift_manager.execute_from_file(query_file_temp_name,
                                           curated_bucket_name=curated_bucket,
                                           manifest_curated_key=manifest_s3key_name,
                                           batchIdValue=batch_id,
                                           dw_schema_name=dw_schema_name,
                                           elt_schema_name=elt_schema_name)
        # Delete the downloaded template once it has been executed.
        os.remove(REDSHIFT_SQL_DIR + "/" + query_file_temp_name)
    except Exception as e:
        LoggerUtility.log_error("Failed to persist curated data to redshift for table "
                                "name - {} with exception - {}".format(table_name, e))
        raise
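# Hedged sketch (not the project's actual template): one plausible shape for a
# persist_<table>.sql file, assuming query_loader.load_from_file substitutes the
# keyword arguments above via str.format-style placeholders. The staging and
# target table names and columns are illustrative only.
#
#   COPY {elt_schema_name}.alert_staging
#   FROM 's3://{curated_bucket_name}/{manifest_curated_key}'
#   IAM_ROLE '{redshift_role_arn}'
#   REGION '{region_name}'
#   MANIFEST FORMAT AS JSON 'auto';
#
#   INSERT INTO {dw_schema_name}.alert
#   SELECT *, '{batchIdValue}' AS batch_id
#   FROM {elt_schema_name}.alert_staging;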
def push_metadata_to_elasticsearch(self, bucket_name, metadata):
    try:
        elasticsearch_endpoint = os.environ[Constants.ES_ENDPOINT_ENV_VAR]
    except KeyError as e:
        LoggerUtility.log_error(str(e) + " not configured")
        raise e
    es_client = ElasticsearchClient.get_client(elasticsearch_endpoint)
    try:
        es_client.index(index=Constants.DEFAULT_INDEX_ID,
                        doc_type=bucket_name,
                        body=json.dumps(metadata))
    except ElasticsearchException as e:
        LoggerUtility.log_error(e)
        LoggerUtility.log_error("Could not index in Elasticsearch")
        raise e
def poll_for_batches(self, event, context):
    """
    Gets a message from the data persistence queue to start the persistence in Redshift.

    :param event: a dictionary with information on a batch
    :param context: not used, only logged
    :return: a dictionary with the batch id, queue details and the is_historical flag
    """
    LoggerUtility.log_info("Context: {}".format(context))
    try:
        sqs = boto3.resource('sqs', region_name='us-east-1')
        is_historical = event["is_historical"] == "true"
        persist_sqs = os.environ["persistence_sqs"]
        if is_historical:
            persist_sqs = os.environ["persistence_historical_sqs"]
        queue = sqs.get_queue_by_name(QueueName=persist_sqs)
        data = dict()
        data["is_historical"] = str(is_historical).lower()
        if 'BatchId' not in event:
            # No batch id in the event: take BatchId, queueUrl and receiptHandle
            # from the first message in the queue.
            for message in queue.receive_messages():
                json_body = json.loads(message.body)
                data["BatchId"] = json_body["BatchId"]
                data["queueUrl"] = message.queue_url
                data["receiptHandle"] = message.receipt_handle
                LoggerUtility.log_info("Batch {} retrieved for processing".format(json_body["BatchId"]))
                break
        else:
            # Otherwise, only take the BatchId from the event.
            data["BatchId"] = event['BatchId']
        if 'BatchId' in data:
            self.publish_message_to_sns({
                "BatchId": data["BatchId"],
                "Status": "Persistence process started"
            })
        return data
    except Exception as e:
        LoggerUtility.log_error("Error polling for batches")
        raise e
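# Illustrative sketch (hypothetical values): what poll_for_batches returns when
# it pulls a message off the persistence queue.
#
#   {
#       "is_historical": "false",
#       "BatchId": "1718000000",
#       "queueUrl": "https://sqs.us-east-1.amazonaws.com/123456789012/example-persistence-queue",
#       "receiptHandle": "AQEB...example..."
#   }
#
# When the incoming event already carries a BatchId, only "is_historical" and
# "BatchId" are present in the returned dictionary.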
def push_batch_id_to_nightly_sqs_queue(self, event, context):
    LoggerUtility.log_info("context: {}".format(context))
    current_batch_id = ""
    try:
        if "batchId" in event[0]:
            sqs = boto3.resource('sqs', region_name='us-east-1')
            current_batch_id = event[0]["batchId"]
            nightly_queue_name = os.environ["SQS_NIGHTLY_PERSISTENCE_QUEUE_NAME"]
            nightly_batches_queue = sqs.get_queue_by_name(QueueName=nightly_queue_name)
            response = nightly_batches_queue.send_message(
                MessageBody=json.dumps({'BatchId': current_batch_id}),
                MessageGroupId="WazeNightlyPersistenceBatchesMessageGroup")
            LoggerUtility.log_info("Successfully pushed the message to nightly queue for batch_id -"
                                   " {} with response - {}".format(current_batch_id, response))
    except Exception as e:
        LoggerUtility.log_error(
            "Unable to push sqs message to nightly queue for batchId {}".format(current_batch_id))
        raise e
def delete_sqs_message(self, event, context):
    LoggerUtility.log_info("context: {}".format(context))
    batch_id = ""
    try:
        if "queueUrl" in event[0] and "batchId" in event[0]:
            sqs = boto3.resource('sqs', region_name='us-east-1')
            queue_url = event[0]["queueUrl"]
            receipt_handle = event[0]["receiptHandle"]
            batch_id = event[0]["batchId"]
            # queueUrl presence has already been checked above, so the message
            # can be deleted directly.
            message = sqs.Message(queue_url, receipt_handle)
            message.delete()
            LoggerUtility.log_info("Message deleted from sqs for batchId {}".format(batch_id))
            self.publish_message_to_sns({"BatchId": batch_id, "Status": "Persistence process completed"})
    except Exception as e:
        LoggerUtility.log_error("Unable to delete sqs message for batchId {}".format(batch_id))
        raise e
def get_client(elasticsearch_endpoint):
    LoggerUtility.set_level()
    try:
        awsauth = AWSRequestsAuth(
            aws_access_key=os.environ[Constants.ACCESS_KEY_ENV_VAR],
            aws_secret_access_key=os.environ[Constants.SECRET_KEY_ENV_VAR],
            aws_token=os.environ[Constants.SESSION_TOKEN_ENV_VAR],
            aws_host=elasticsearch_endpoint,
            aws_region=os.environ[Constants.REGION_ENV_VAR],
            aws_service=Constants.ELASTICSEARCH_SERVICE_CLIENT
        )
    except KeyError as e:
        LoggerUtility.log_error(str(e) + " not configured")
        LoggerUtility.log_error("Failed to register kibana dashboard")
        raise e
    return Elasticsearch(
        hosts=['{0}:443'.format(elasticsearch_endpoint)],
        use_ssl=True,
        connection_class=RequestsHttpConnection,
        http_auth=awsauth
    )
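# Usage sketch (hypothetical endpoint): building a signed client against an
# Amazon Elasticsearch Service domain over HTTPS on port 443. The Constants.*
# environment variable names are defined elsewhere in the project and are
# presumed to point at the Lambda runtime's temporary credentials.
#
#   es_client = ElasticsearchClient.get_client(
#       "search-example-domain.us-east-1.es.amazonaws.com")
#   es_client.index(index="example-index", doc_type="example-bucket",
#                   body='{"Key": "waze/part-0000.json"}')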
def persist_record_to_dynamodb_table(s3_key, table_name, state, num_records,
                                     bucket, batch_id, is_historical, month, year):
    try:
        dynamodb_curated_records_table_name = os.environ['DDB_CURATED_RECORDS_TABLE_ARN'].split('/')[1]
        persist_records = bool(int(os.environ['PERSIST_RECORDS']))
        s3_key = "s3://" + bucket + "/" + s3_key
        if persist_records:
            dynamodb = boto3.resource('dynamodb')
            curated_record_table = dynamodb.Table(dynamodb_curated_records_table_name)
            response = curated_record_table.put_item(
                Item={
                    'CurationRecordId': str(uuid.uuid4()),
                    'BatchId': batch_id,
                    'DataTableName': table_name,
                    'S3Key': s3_key,
                    'State': state,
                    'TotalNumCuratedRecords': num_records,
                    'IsHistorical': is_historical,
                    'Year': year,
                    'Month': month
                })
            LoggerUtility.log_info(
                "Successfully persisted record to dynamo db table - {}".format(response))
        else:
            LoggerUtility.log_info("Persist records flag is disabled, so not persisting "
                                   "any records to dynamodb table")
    except Exception as e:
        LoggerUtility.log_error(
            "Failed to persist record to dynamo db table for key - {}".format(s3_key))
        raise e
def update_manifest_status(self, event, context):
    LoggerUtility.log_info("context: {}".format(context))
    batch_id = ""
    table_name = ""
    try:
        session = boto3.session.Session()
        ddb = session.resource('dynamodb', region_name='us-east-1')
        ddb_table_name = os.environ['DDB_MANIFEST_TABLE_ARN'].split('/')[1]
        manifest_index_name = os.environ['DDB_MANIFEST_FILES_INDEX_NAME']
        table_name = event['tablename']
        batch_id = event['batchId']
        ddb_table = ddb.Table(ddb_table_name)
        response = ddb_table.query(
            IndexName=manifest_index_name,
            KeyConditionExpression=Key('BatchId').eq(batch_id) & Key('TableName').eq(table_name),
            FilterExpression=Attr('FileStatus').eq('open'))
        if response['Count'] > 0:
            for item in response['Items']:
                if table_name == item['TableName']:
                    # Mark the open manifest for this batch and table as completed.
                    ddb_table.update_item(
                        Key={
                            'ManifestId': item['ManifestId'],
                            'BatchId': batch_id
                        },
                        UpdateExpression='set FileStatus = :f',
                        ExpressionAttributeValues={':f': 'completed'})
                    LoggerUtility.log_info(
                        "Updated manifest status for batchId {} and table {}".format(batch_id, table_name))
                    break
    except Exception as e:
        LoggerUtility.log_error(
            "Unable to update manifest status for batchId {} and table {}".format(batch_id, table_name))
        raise e
def lambda_handler(event, context):
    LoggerUtility.set_level()
    update_manifest_handle_event = ManifestHandler()
    update_manifest_handle_event.update_manifest(event, context)
    return event
def lambda_handler(event, context):
    LoggerUtility.set_level()
    get_batches_handle_event = SqsHandler()
    return get_batches_handle_event.get_batches(event, context)
class ClosePipeline:
    sqs = boto3.resource('sqs', region_name='us-east-1')
    sns = boto3.client('sns', region_name='us-east-1')

    def publish_message_to_sns(self, message):
        """
        Publishes a message to Amazon's Simple Notification Service.

        :param message: dict
        """
        self.sns.publish(
            TargetArn=os.environ['BATCH_NOTIFICATION_SNS'],
            Message=json.dumps({'default': json.dumps(message)}),
            MessageStructure='json'
        )

    def put_message_sqs(self, batch_id, sqs_persist):
        """
        Puts a batch into a queue via Amazon's Simple Queue Service.

        :param batch_id: the batch id of the batch
        :param sqs_persist: the name of the queue
        :return:
        """
        try:
            queue = self.sqs.get_queue_by_name(QueueName=sqs_persist)
            response = queue.send_message(MessageBody=json.dumps({'BatchId': batch_id}))
            LoggerUtility.log_info(
                "Successfully put message to persist sqs for batch id - {}, response - {}".format(batch_id, response))
        except Exception as e:
            LoggerUtility.log_error(
                "Unable to put message to persist sqs for batch id - {} , sqs - {}".format(batch_id, sqs_persist))
            raise e

    def delete_sqs_message(self, event, context):
        """
        Moves a message to the persistence queue, then deletes it from the previous
        queue via Amazon's Simple Queue Service.

        :param event: a list with a dictionary that contains information on a batch
        :param context: not used, only logged
        :return:
        """
        LoggerUtility.log_info("context: {}".format(context))
        batch_id = ""
        try:
            if "queueUrl" in event[0]:
                queue_url = event[0]["queueUrl"]
                receipt_handle = event[0]["receiptHandle"]
                batch_id = event[0]["batch_id"]
                is_historical = event[0]["is_historical"] == "true"
                persistence_queue = os.environ['SQS_PERSIST_ARN']
                if is_historical:
                    persistence_queue = os.environ['SQS_PERSIST_HISTORICAL_ARN']
                # Put the message into the persistence queue via batchId.
                self.put_message_sqs(batch_id, persistence_queue)
                # Delete the message from the previous queue; queueUrl presence
                # has already been checked above.
                message = self.sqs.Message(queue_url, receipt_handle)
                message.delete()
                LoggerUtility.log_info("Message deleted from sqs for batchId {}".format(batch_id))
                self.publish_message_to_sns({"BatchId": batch_id, "Status": "Manifest generation completed"})
        except Exception as e:
            LoggerUtility.log_error("Unable to delete sqs message for batchId {}".format(batch_id))
            raise e

    def close_pipeline(self, event, context):
        """
        Executes delete_sqs_message.

        :param event: a list with a dictionary that contains information on a batch
        :param context: not used, just passed through
        :return:
        """
        self.delete_sqs_message(event, context)
def lambda_handler(event, context):
    LoggerUtility.set_level()
    get_manifests_handle_event = ManifestHandler()
    return get_manifests_handle_event.get_manifests(event)
def publish_custom_metrics_to_cloudwatch(self, bucket_name, metadata):
    cloudwatch_client = boto3.client('cloudwatch', region_name='us-east-1')
    try:
        if bucket_name == os.environ["SUBMISSIONS_BUCKET_NAME"] and metadata["Dataset"] == "waze":
            cloudwatch_client.put_metric_data(
                Namespace=os.environ["WAZE_SUBMISSIONS_COUNT_METRIC"],
                MetricData=[{
                    'MetricName': 'Counts by state and traffic type',
                    'Dimensions': [
                        {'Name': 'State', 'Value': metadata["State"]},
                        {'Name': 'TrafficType', 'Value': metadata["TrafficType"]}
                    ],
                    'Value': 1,
                    'Unit': 'Count'
                }])
            # Submissions of 166 bytes or fewer are counted as zero-byte submissions.
            if metadata["ContentLength"] <= 166:
                cloudwatch_client.put_metric_data(
                    Namespace=os.environ["WAZE_ZERO_BYTE_SUBMISSIONS_COUNT_METRIC"],
                    MetricData=[{
                        'MetricName': 'Zero Byte Submissions by State and traffic type',
                        'Dimensions': [
                            {'Name': 'State', 'Value': metadata["State"]},
                            {'Name': 'TrafficType', 'Value': metadata["TrafficType"]}
                        ],
                        'Value': 1,
                        'Unit': 'Count'
                    }])
        elif bucket_name == os.environ["SUBMISSIONS_BUCKET_NAME"] and metadata["Dataset"] == "cv":
            cloudwatch_client.put_metric_data(
                Namespace=os.environ["CV_SUBMISSIONS_COUNTS_METRIC"],
                MetricData=[{
                    'MetricName': 'Counts by provider and datatype',
                    'Dimensions': [
                        {'Name': 'DataProvider', 'Value': metadata["DataProvider"]},
                        {'Name': 'DataType', 'Value': metadata["DataType"]}
                    ],
                    'Value': 1,
                    'Unit': 'Count'
                }])
        elif bucket_name == os.environ["CURATED_BUCKET_NAME"] and metadata["Dataset"] == "waze":
            cloudwatch_client.put_metric_data(
                Namespace=os.environ["WAZE_CURATED_COUNTS_METRIC"],
                MetricData=[{
                    'MetricName': 'Counts by state and table name',
                    'Dimensions': [
                        {'Name': 'State', 'Value': metadata["State"]},
                        {'Name': 'TableName', 'Value': metadata["TableName"]}
                    ],
                    'Value': 1,
                    'Unit': 'Count'
                }])
    except Exception as e:
        LoggerUtility.log_error(e)
        LoggerUtility.log_error("Failed to publish custom cloudwatch metrics")
        raise e
def lambda_handler(event, context):
    LoggerUtility.set_level()
    close_pipeline_handle_event = ClosePipeline()
    close_pipeline_handle_event.close_pipeline(event, context)
    return event