def handler(event, context):  # pylint: disable=unused-argument
    """Lambda entry point.

    Event keys:
      prefix(string): Common prefix of S3 keys to be sent to queue
      queue(string): URL of the queue, optional. If set a single message is read
    """
    if "queue" in event:
        # Consume payload data from job queue, one job only
        response = get_client("sqs").receive_message(QueueUrl=event["queue"])
        receipt_handle = response["Messages"][0]["ReceiptHandle"]
        populate_queue_with_quicklooks(
            bucket=os.environ["CBERS_PDS_BUCKET"],
            prefix=response["Messages"][0]["Body"],
            suffix=r"\.(jpg|png)",
            queue=os.environ["NEW_SCENES_QUEUE"],
        )
        get_client("sqs").delete_message(
            QueueUrl=event["queue"], ReceiptHandle=receipt_handle
        )
    else:
        # Lambda called from SQS trigger
        for record in event["Records"]:
            populate_queue_with_quicklooks(
                bucket=os.environ["CBERS_PDS_BUCKET"],
                prefix=record["body"],
                suffix=r"\.(jpg|png)",
                queue=os.environ["NEW_SCENES_QUEUE"],
            )
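

# Illustrative events accepted by handler(); the queue URL and prefix values are
# hypothetical and shown only to document the two supported shapes:
#
#   Direct invocation, consuming a single job from an existing work queue:
#     {"queue": "https://sqs.us-east-1.amazonaws.com/123456789012/reconcile-queue"}
#
#   SQS trigger, one S3 prefix per record body:
#     {"Records": [{"body": "CBERS4/MUX/083/095/"}]}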


def populate_queue_with_subdirs(bucket: str, prefix: str, queue: str) -> None:
    """
    Populate queue with messages containing S3 keys from 'prefix',
    grouped by the first occurrence of '/' after 'prefix'.
    It is required that the prefix ends with '/', which means that all
    the subdirs will be scanned.

    Input:
      bucket(string): STAC bucket
      prefix(string): common prefix to be scanned, must end with '/'
      queue(string): queue url
    """
    # No reason to run the function without scanning subdirs
    assert prefix[-1] == "/"
    dirs = get_client("s3").list_objects_v2(
        Bucket=bucket,
        Prefix=prefix,
        Delimiter="/",
    )
    # Paging is not supported here
    assert not dirs["IsTruncated"]
    for dir_key in dirs["CommonPrefixes"]:
        LOGGER.info(dir_key["Prefix"])
        get_client("sqs").send_message(QueueUrl=queue, MessageBody=dir_key["Prefix"])
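

# Fan-out sketch for populate_queue_with_subdirs(), with hypothetical names: for
# prefix "CBERS4/MUX/083/" the listing with Delimiter="/" returns CommonPrefixes
# such as "CBERS4/MUX/083/094/" and "CBERS4/MUX/083/095/", and each of those
# strings becomes the body of one message on the target queue.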


def consume_stac_reconcile_queue_handler(
    event, context
):  # pylint: disable=unused-argument
    """Lambda entry point.

    Event keys:
      prefix(string): Common prefix of STAC items to be sent to queue
      queue(string): URL of the queue, optional. If set a single message is read
    """
    if "queue" in event:
        # Consume payload data from job queue, one job only
        response = get_client("sqs").receive_message(QueueUrl=event["queue"])
        receipt_handle = response["Messages"][0]["ReceiptHandle"]
        send_stac_items_to_queue(
            bucket=os.environ["CBERS_STAC_BUCKET"],
            prefix=response["Messages"][0]["Body"],
            queue=os.environ["insert_into_elasticsearch_queue_url"],
        )
        get_client("sqs").delete_message(
            QueueUrl=event["queue"], ReceiptHandle=receipt_handle
        )
    else:
        # Lambda called from SQS trigger
        for record in event["Records"]:
            send_stac_items_to_queue(
                bucket=os.environ["CBERS_STAC_BUCKET"],
                prefix=record["body"],
                queue=os.environ["insert_into_elasticsearch_queue_url"],
            )


def populate_queue_with_quicklooks(bucket, prefix, suffix, queue):
    """
    Populate queue with items to be processed. The items are
    obtained from bucket/prefix/*/suffix
    """
    suffix = r".*" + suffix
    files = get_client("s3").list_objects_v2(
        Bucket=bucket, Prefix=prefix, RequestPayer="requester"
    )
    while True:
        for file in files["Contents"]:
            if re.search(suffix, file["Key"]):
                message = dict()
                message["Message"] = json.dumps(
                    {"Records": [{"s3": {"object": {"key": file["Key"], "reconcile": 1}}}]}
                )
                get_client("sqs").send_message(
                    QueueUrl=queue, MessageBody=json.dumps(message)
                )
        if not files["IsTruncated"]:
            break
        files = get_client("s3").list_objects_v2(
            Bucket=bucket,
            Prefix=prefix,
            ContinuationToken=files["NextContinuationToken"],
            RequestPayer="requester",
        )
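

# Shape of each message body produced by populate_queue_with_quicklooks(); the
# quicklook key below is hypothetical and "reconcile": 1 marks the record as
# coming from a reconciliation run. The "Message" wrapper mirrors an SNS-wrapped
# S3 event notification so downstream consumers can treat both sources alike:
#
# {
#     "Message": "{\"Records\": [{\"s3\": {\"object\": {\"key\": \"CBERS4/MUX/083/095/.../quicklook.jpg\", \"reconcile\": 1}}}]}"
# }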


def process_queue(  # pylint: disable=too-many-arguments
    cbers_pds_bucket,
    cbers_stac_bucket,
    cbers_meta_pds_bucket,
    queue,
    message_batch_size,
    sns_reconcile_target_arn,
    catalog_update_queue,
    catalog_update_table,
    delete_processed_messages=False,
):
    """
    Read quicklook queue and create STAC items if necessary.

    Input:
      cbers_pds_bucket(string): ditto
      cbers_stac_bucket(string): ditto
      cbers_meta_pds_bucket(string): ditto
      queue(string): SQS URL
      message_batch_size: maximum number of messages to be processed,
        0 for all messages.
      sns_reconcile_target_arn: SNS arn for reconciled STAC items topic
      catalog_update_queue(string): URL of the queue that receives new STAC
        items for updating the catalog structure
      catalog_update_table: DynamoDB table that holds the catalog update requests
      delete_processed_messages: if True, messages are deleted from the queue
        after processing
    """
    buckets = {
        "cog": cbers_pds_bucket,
        "stac": cbers_stac_bucket,
        "metadata": cbers_meta_pds_bucket,
    }
    processed_messages = 0
    for msg in sqs_messages(queue):
        process_message(
            msg,
            buckets,
            sns_reconcile_target_arn,
            catalog_update_queue,
            catalog_update_table,
        )
        # Remove message from queue
        if delete_processed_messages:
            get_client("sqs").delete_message(
                QueueUrl=queue, ReceiptHandle=msg["ReceiptHandle"]
            )
        processed_messages += 1
        if processed_messages == message_batch_size:
            break


def write_catalog_to_s3(bucket, prefix, catalog):
    """
    Uploads a catalog represented as a dictionary to bucket with
    prefix/catalog.json key. If the dictionary defines a 'license' it is
    treated as a collection and written to prefix/collection.json instead.
    """
    if "license" not in catalog:
        s3_catalog_file = prefix + "/catalog.json"
    else:
        s3_catalog_file = prefix + "/collection.json"
    get_client("s3").put_object(
        Body=json.dumps(catalog, indent=2), Bucket=bucket, Key=s3_catalog_file
    )


def catalog_update_request(table_name, stac_item_key):
    """
    Generate a catalog structure update request by recording the STAC item
    key into a DynamoDB table.

    Input:
      stac_item_key(string): ditto
      table_name(string): DynamoDB table name
    """
    get_client("dynamodb").put_item(
        TableName=table_name,
        Item={
            "stacitem": {"S": stac_item_key},
            "datetime": {"S": str(datetime.datetime.now())},
        },
    )
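

# Example of the resulting DynamoDB item, with a hypothetical STAC item key;
# "datetime" stores str(datetime.datetime.now()) at request time:
#
# {
#     "stacitem": {"S": "CBERS4/MUX/083/095/CBERS_4_MUX_20200101_083_095_L4.json"},
#     "datetime": {"S": "2020-01-01 12:00:00.000000"},
# }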


def process(self):
    """
    Main processing: scan the input DynamoDB table, collect the catalog
    levels to be updated, send them to the catalog update queue and remove
    the processed items from the table.
    """
    response = get_client("dynamodb").scan(
        TableName=self._input_table, Limit=self._limit
    )
    self.__parse_items(response["Items"])
    iterations = 1
    while "LastEvaluatedKey" in response and iterations <= self._iterations:
        response = get_client("dynamodb").scan(
            TableName=self._input_table,
            Limit=self._limit,
            ExclusiveStartKey=response["LastEvaluatedKey"],
        )
        self.__parse_items(response["Items"])
        iterations += 1

    LOGGER.info("Number of stacitems: %d", len(self._items))
    LOGGER.info("Number of levels to be updated: %d", len(self._levels_to_be_updated))
    LOGGER.info("Start sending SQS messages")

    # Update catalog level table and send prefix to catalog update queue
    entries = list()
    for level in self._levels_to_be_updated:
        # This is only executed if the table is defined, currently not used
        if self._output_table:
            get_client("dynamodb").put_item(
                TableName=self._output_table,
                Item={"catalog_level": {"S": level}},
            )
        entries.append({"Id": str(len(entries)), "MessageBody": level})
        # send_message_batch accepts at most 10 entries per call
        if len(entries) == 10:
            get_client("sqs").send_message_batch(QueueUrl=self._queue, Entries=entries)
            entries.clear()
    if entries:
        get_client("sqs").send_message_batch(QueueUrl=self._queue, Entries=entries)
        entries.clear()

    # Remove processed items
    LOGGER.info("Start deleting stacitems")
    for item in self._items:
        get_client("dynamodb").delete_item(
            TableName=self._input_table,
            Key={"stacitem": {"S": item["stacitem"]["S"]}},
        )
    LOGGER.info("Finished")


def get_catalogs_from_s3(bucket, prefix, response=None):
    """
    Return a list with the catalogs (catalog.json files) located in the S3
    prefix. Assumes that every subdir contains a catalog.json file.

    Input:
      bucket(string): bucket name
      prefix(string): key prefix
      response(dict): S3 output from list_objects_v2, used for unit testing
    """
    ret = list()
    if not response:
        response = get_client("s3").list_objects_v2(
            Bucket=bucket, Delimiter="/", Prefix=prefix
        )
    assert not response["IsTruncated"], "Truncated S3 listing"
    for item in response["CommonPrefixes"]:
        key = item["Prefix"].split("/")[-2] + "/catalog.json"
        ret.append({"rel": "child", "href": key})
    return sorted(ret, key=itemgetter("href"))


def sqs_messages(queue):
    """
    Generator for SQS messages.

    Input:
      queue(string): SQS URL.

    Output:
      dict with the following keys:
        key: Quicklook s3 key
        ReceiptHandle: Message receipt handle
    """
    while True:
        response = get_client("sqs").receive_message(QueueUrl=queue)
        if "Messages" not in response:
            break
        # The body holds the (SNS-wrapped) S3 notification built by
        # populate_queue_with_quicklooks() or sent by the S3 trigger;
        # extract the quicklook key expected by process_message().
        body = json.loads(response["Messages"][0]["Body"])
        records = json.loads(body["Message"])
        retd = dict()
        retd["key"] = records["Records"][0]["s3"]["object"]["key"]
        retd["ReceiptHandle"] = response["Messages"][0]["ReceiptHandle"]
        yield retd
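

# Minimal usage sketch: the generator polls receive_message() until no "Messages"
# key is returned, so a plain loop drains the queue (the queue URL is hypothetical):
#
# for msg in sqs_messages("https://sqs.us-east-1.amazonaws.com/123456789012/quicklook-queue"):
#     print(msg["key"], msg["ReceiptHandle"])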


def get_items_from_s3(bucket, prefix, response=None):
    """
    Return a list with the items (.json files) located in the S3 prefix.

    Input:
      bucket(string): bucket name
      prefix(string): key prefix
      response(dict): S3 output from list_objects_v2, used for unit testing
    """
    ret = list()
    if not response:
        response = get_client("s3").list_objects_v2(
            Bucket=bucket, Delimiter="/", Prefix=prefix
        )
    assert not response["IsTruncated"], "Truncated S3 listing"
    for item in response["Contents"]:
        key = item["Key"].split("/")[-1]
        # Skip catalog.json files, include only L\d{1}.json items
        if re.match(r".*L\d{1}.json", key):
            ret.append({"rel": "item", "href": key})
    return sorted(ret, key=itemgetter("href"))


def send_stac_items_to_queue(bucket: str, queue: str, prefix: str) -> None:
    """Send the STAC items under 'prefix' to a SQS queue.

    Args:
      bucket: STAC bucket name
      queue: URL of the destination SQS queue
      prefix: common prefix of the STAC items to be sent
    """
    sqs_queue = get_resource("sqs").Queue(queue)
    s3_client = get_client("s3")
    paginator = s3_client.get_paginator("list_objects_v2")
    for result in paginator.paginate(
        Bucket=bucket, PaginationConfig={"PageSize": 1000}, Prefix=prefix
    ):
        assert result["KeyCount"] <= 1000
        entries = [
            {
                "Id": str(index),
                "MessageBody": json.dumps(
                    {
                        "Message": json.dumps(
                            json.loads(
                                s3_client.get_object(Bucket=bucket, Key=obj["Key"])[
                                    "Body"
                                ]
                                .read()
                                .decode("utf-8")
                            )
                        )
                    }
                ),
            }
            for index, obj in enumerate(result["Contents"])
            if obj["Key"].split("/")[-1] not in ["catalog.json", "collection.json"]
        ]
        # Send items to queue at most 10 at a time
        chunk_size = 10
        chunked_entries = [
            entries[i * chunk_size : (i + 1) * chunk_size]
            for i in range((len(entries) + chunk_size - 1) // chunk_size)
        ]
        for chunk in chunked_entries:
            response = sqs_queue.send_messages(Entries=chunk)
            assert len(response["Successful"]) == len(chunk)
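

# Batching note for send_stac_items_to_queue(): SQS batch sends (send_message_batch
# on the client, Queue.send_messages on the resource) accept at most 10 entries per
# call, so the chunking above splits each page into ceil(len(entries) / 10) batches;
# e.g. 25 entries become 3 batches of sizes 10, 10 and 5.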


def process_message(
    msg, buckets, sns_target_arn, catalog_update_queue, catalog_update_table
):
    """
    Process a single message. Generate STAC item, send STAC item to SNS topic,
    write key into DynamoDB table and, optionally, send key to queue for
    further processing.

    Input:
      msg(dict): message (quicklook) to be processed, quicklook s3 key is 'key'.
      buckets(dict): buckets for 'cog', 'stac' and 'metadata'
      sns_target_arn(string): SNS arn for STAC items. Items are always published
      catalog_update_queue(string): URL of the queue that receives new STAC items
        for updating the catalog structure, None if not used.
      catalog_update_table: DynamoDB table that holds the catalog update requests
    """
    LOGGER.info(msg["key"])
    metadata_keys = get_s3_keys(msg["key"])

    assert metadata_keys["quicklook_keys"]["camera"] in (
        "MUX",
        "AWFI",
        "PAN10M",
        "PAN5M",
        "WPM",
        "WFI",
    ), ("Unrecognized key: " + metadata_keys["quicklook_keys"]["camera"])

    local_inpe_metadata = "/tmp/" + metadata_keys["inpe_metadata"].split("/")[-1]
    local_stac_item = "/tmp/" + metadata_keys["stac"].split("/")[-1]

    # Download INPE metadata and generate STAC item file
    with open(local_inpe_metadata, "wb") as data:
        get_client("s3").download_fileobj(
            buckets["cog"],
            metadata_keys["inpe_metadata"],
            data,
            ExtraArgs={"RequestPayer": "requester"},
        )
    stac_meta = convert_inpe_to_stac(
        inpe_metadata_filename=local_inpe_metadata,
        stac_metadata_filename=local_stac_item,
        buckets=buckets,
    )
    # Upload STAC item file
    with open(local_stac_item, "rb") as data:
        get_client("s3").upload_fileobj(data, buckets["stac"], metadata_keys["stac"])

    # Publish to SNS topic
    get_client("sns").publish(
        TargetArn=sns_target_arn,
        Message=json.dumps(stac_meta),
        MessageAttributes=build_sns_topic_msg_attributes(stac_meta),
    )

    # Send message to update catalog tree queue
    if catalog_update_queue:
        get_client("sqs").send_message(
            QueueUrl=catalog_update_queue, MessageBody=metadata_keys["stac"]
        )

    # Request catalog update
    catalog_update_request(
        table_name=catalog_update_table, stac_item_key=metadata_keys["stac"]
    )