Example 1
def handler(event, context):  # pylint: disable=unused-argument
    """Lambda entry point
    Event keys:
      prefix(string): Common prefix of S3 keys to be sent to queue
    """

    if "queue" in event:
        # Consume payload data from job queue, one job only
        response = get_client("sqs").receive_message(QueueUrl=event["queue"])
        receipt_handle = response["Messages"][0]["ReceiptHandle"]
        populate_queue_with_quicklooks(
            bucket=os.environ["CBERS_PDS_BUCKET"],
            prefix=response["Messages"][0]["Body"],
            suffix=r"\.(jpg|png)",
            queue=os.environ["NEW_SCENES_QUEUE"],
        )
        get_client("sqs").delete_message(QueueUrl=event["queue"],
                                         ReceiptHandle=receipt_handle)

    else:
        # Lambda called from SQS trigger
        for record in event["Records"]:
            populate_queue_with_quicklooks(
                bucket=os.environ["CBERS_PDS_BUCKET"],
                prefix=record["body"],
                suffix=r"\.(jpg|png)",
                queue=os.environ["NEW_SCENES_QUEUE"],
            )
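
A minimal invocation sketch for the handler above; the environment variables, bucket name, queue URLs and event shapes below are placeholders, not values from the original project.

import os

os.environ["CBERS_PDS_BUCKET"] = "cbers-pds"  # assumed bucket name
os.environ["NEW_SCENES_QUEUE"] = "https://sqs.us-east-1.amazonaws.com/123456789012/new-scenes"  # assumed

# Direct invocation: read a single job (a prefix) from an existing job queue
handler({"queue": "https://sqs.us-east-1.amazonaws.com/123456789012/jobs"}, None)

# SQS trigger invocation: each record carries the prefix in its body
handler({"Records": [{"body": "CBERS4/MUX/071/"}]}, None)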
Example 2
def populate_queue_with_subdirs(bucket: str, prefix: str, queue: str) -> None:
    """
    Populate queue with messages containing S3 keys from
    'prefix', grouped by the first occurrence of '/' after
    'prefix'. The prefix is required to end with '/', which
    means that all of its subdirs will be scanned.

    Input:
      bucket(string): STAC bucket
      prefix(string): key prefix, must end with '/'
      queue(string): queue URL
    """

    # No reason to run the function without scanning subdirs
    assert prefix[-1] == "/"

    dirs = get_client("s3").list_objects_v2(
        Bucket=bucket,
        Prefix=prefix,
        Delimiter="/",
    )

    # Paging is not supported here
    assert not dirs["IsTruncated"]
    for dir_key in dirs["CommonPrefixes"]:
        LOGGER.info(dir_key["Prefix"])
        get_client("sqs").send_message(QueueUrl=queue,
                                       MessageBody=dir_key["Prefix"])
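
A minimal usage sketch; the bucket name and queue URL below are placeholders.

populate_queue_with_subdirs(
    bucket="cbers-stac",                                                # assumed
    prefix="CBERS4/MUX/",                                               # must end with '/'
    queue="https://sqs.us-east-1.amazonaws.com/123456789012/subdirs",   # assumed
)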
Example 3
def consume_stac_reconcile_queue_handler(event, context):  # pylint: disable=unused-argument
    """Lambda entry point.
    Event keys:
      prefix(string): Common prefix of STAC items to be sent to queue
      queue(string): URL of the queue, optional. If set, a single message is read
    """

    if "queue" in event:
        # Consume payload data from job queue, one job only
        response = get_client("sqs").receive_message(QueueUrl=event["queue"])
        receipt_handle = response["Messages"][0]["ReceiptHandle"]
        send_stac_items_to_queue(
            bucket=os.environ["CBERS_STAC_BUCKET"],
            prefix=response["Messages"][0]["Body"],
            queue=os.environ["insert_into_elasticsearch_queue_url"],
        )
        get_client("sqs").delete_message(QueueUrl=event["queue"],
                                         ReceiptHandle=receipt_handle)

    else:
        # Lambda called from SQS trigger
        for record in event["Records"]:
            send_stac_items_to_queue(
                bucket=os.environ["CBERS_STAC_BUCKET"],
                prefix=record["body"],
                queue=os.environ["insert_into_elasticsearch_queue_url"],
            )
Example 4
def populate_queue_with_quicklooks(bucket, prefix, suffix, queue):
    """
    Populate queue with items to be processed. The items are obtained
    from bucket/prefix/*/suffix
    """
    suffix = r".*" + suffix
    files = get_client("s3").list_objects_v2(Bucket=bucket,
                                             Prefix=prefix,
                                             RequestPayer="requester")

    while True:
        for file in files["Contents"]:
            if re.search(suffix, file["Key"]):
                message = dict()
                message["Message"] = json.dumps({
                    "Records": [{
                        "s3": {
                            "object": {
                                "key": file["Key"],
                                "reconcile": 1
                            }
                        }
                    }]
                })
                get_client("sqs").send_message(QueueUrl=queue,
                                               MessageBody=json.dumps(message))
        if not files["IsTruncated"]:
            break
        files = get_client("s3").list_objects_v2(
            Bucket=bucket,
            Prefix=prefix,
            ContinuationToken=files["NextContinuationToken"],
            RequestPayer="requester",
        )
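
A usage sketch with placeholder names; 'suffix' is a regular expression applied to each listed key.

populate_queue_with_quicklooks(
    bucket="cbers-pds",                                                   # assumed
    prefix="CBERS4/MUX/071/",                                             # assumed
    suffix=r"\.(jpg|png)",
    queue="https://sqs.us-east-1.amazonaws.com/123456789012/new-scenes",  # assumed
)
# Each queued message body is a JSON document whose "Message" value is itself
# a JSON string: {"Records": [{"s3": {"object": {"key": <key>, "reconcile": 1}}}]}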
Example 5
def process_queue(  # pylint: disable=too-many-arguments
    cbers_pds_bucket,
    cbers_stac_bucket,
    cbers_meta_pds_bucket,
    queue,
    message_batch_size,
    sns_reconcile_target_arn,
    catalog_update_queue,
    catalog_update_table,
    delete_processed_messages=False,
):
    """
    Read quicklook queue and create STAC items if necessary.

    Input:
      cbers_pds_bucket(string): CBERS PDS bucket (COG files)
      cbers_stac_bucket(string): CBERS STAC bucket
      cbers_meta_pds_bucket(string): CBERS PDS metadata bucket
      queue(string): SQS URL
      message_batch_size: maximum number of messages to be processed, 0 for
                          all messages.
      sns_reconcile_target_arn: SNS arn for reconciled stac items topic
      catalog_update_queue(string): URL of queue that receives new STAC
                                    items for updating the catalog structure
      catalog_update_table: DynamoDB table that holds the catalog update
                            requests
      delete_processed_messages: if True, messages are deleted from the queue
                                 after processing
    """

    buckets = {
        "cog": cbers_pds_bucket,
        "stac": cbers_stac_bucket,
        "metadata": cbers_meta_pds_bucket,
    }
    processed_messages = 0
    for msg in sqs_messages(queue):

        process_message(
            msg,
            buckets,
            sns_reconcile_target_arn,
            catalog_update_queue,
            catalog_update_table,
        )

        # Remove message from queue
        if delete_processed_messages:
            get_client("sqs").delete_message(
                QueueUrl=queue, ReceiptHandle=msg["ReceiptHandle"]
            )

        processed_messages += 1
        if processed_messages == message_batch_size:
            break
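
A usage sketch; every bucket name, URL, ARN and table name below is a placeholder.

process_queue(
    cbers_pds_bucket="cbers-pds",                # assumed
    cbers_stac_bucket="cbers-stac",              # assumed
    cbers_meta_pds_bucket="cbers-meta-pds",      # assumed
    queue="https://sqs.us-east-1.amazonaws.com/123456789012/quicklooks",       # assumed
    message_batch_size=0,                        # 0 processes all available messages
    sns_reconcile_target_arn="arn:aws:sns:us-east-1:123456789012:stac-items",  # assumed
    catalog_update_queue=None,                   # do not feed the catalog update queue
    catalog_update_table="CatalogUpdateTable",   # assumed
    delete_processed_messages=True,
)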
Example 6
def write_catalog_to_s3(bucket, prefix, catalog):
    """
    Uploads a catalog represented as a dictionary to bucket
    with prefix/catalog.json key.
    """

    if "license" not in catalog:
        s3_catalog_file = prefix + "/catalog.json"
    else:
        s3_catalog_file = prefix + "/collection.json"

    get_client("s3").put_object(Body=json.dumps(catalog, indent=2),
                                Bucket=bucket,
                                Key=s3_catalog_file)
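
A usage sketch; the bucket name and catalog contents are placeholders. A dictionary without a 'license' key is written to <prefix>/catalog.json; one with that key goes to <prefix>/collection.json.

catalog = {
    "stac_version": "1.0.0",   # assumed
    "id": "CBERS4-MUX",        # assumed
    "description": "Example catalog",
    "links": [],
}
write_catalog_to_s3(bucket="cbers-stac", prefix="CBERS4/MUX", catalog=catalog)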
Example 7
def catalog_update_request(table_name, stac_item_key):
    """
    Generate a catalog structure update request by recording
    the STAC item key into a DynamoDB table.

    Input:
      stac_item_key(string): STAC item key
      table_name(string): DynamoDB table name
    """

    get_client("dynamodb").put_item(
        TableName=table_name,
        Item={
            "stacitem": {"S": stac_item_key},
            "datetime": {"S": str(datetime.datetime.now())},
        },
    )
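
A usage sketch; the table name and item key are placeholders.

catalog_update_request(
    table_name="CatalogUpdateTable",                  # assumed
    stac_item_key="CBERS4/MUX/071/100/item_L2.json",  # assumed
)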
Example 8
    def process(self):
        """
        Main processing
        """
        response = get_client("dynamodb").scan(TableName=self._input_table,
                                               Limit=self._limit)
        self.__parse_items(response["Items"])
        iterations = 1
        while "LastEvaluatedKey" in response and iterations <= self._iterations:
            response = get_client("dynamodb").scan(
                TableName=self._input_table,
                Limit=self._limit,
                ExclusiveStartKey=response["LastEvaluatedKey"],
            )
            self.__parse_items(response["Items"])
            iterations += 1
        LOGGER.info("Number of stacitems: %d", len(self._items))
        LOGGER.info("Number of levels to be updated: %d",
                    len(self._levels_to_be_updated))
        LOGGER.info("Start sending SQS messages")
        # Update catalog level table and send prefix to catalog update queue
        entries = list()
        for level in self._levels_to_be_updated:

            # This is only executed if the table is defined, currently
            # not used
            if self._output_table:
                response = get_client("dynamodb").put_item(
                    TableName=self._output_table,
                    Item={"catalog_level": {
                        "S": level
                    }})

            entries.append({"Id": str(len(entries)), "MessageBody": level})
            # get_client("sqs").send_message(QueueUrl=self._queue,
            #                         MessageBody=level)
            if len(entries) == 10:
                get_client("sqs").send_message_batch(QueueUrl=self._queue,
                                                     Entries=entries)
                entries.clear()
        if entries:
            get_client("sqs").send_message_batch(QueueUrl=self._queue,
                                                 Entries=entries)
            entries.clear()

        # Remove processed items
        LOGGER.info("Start deleting stacitems")
        for item in self._items:
            response = get_client("dynamodb").delete_item(
                TableName=self._input_table,
                Key={"stacitem": {
                    "S": item["stacitem"]["S"]
                }},
            )
        LOGGER.info("Finished")
Example 9
def get_catalogs_from_s3(bucket, prefix, response=None):
    """
    Return a list of child catalog links (catalog.json files) located
    under the S3 prefix. Assumes every subdir contains a catalog.json file.

    Input:
    bucket(string): bucket name
    prefix(string): key prefix
    response(dict): S3 output from list_objects_v2, used for unit testing
    """
    ret = list()
    if not response:
        response = get_client("s3").list_objects_v2(Bucket=bucket,
                                                    Delimiter="/",
                                                    Prefix=prefix)
        assert not response["IsTruncated"], "Truncated S3 listing"
    for item in response["CommonPrefixes"]:
        key = item["Prefix"].split("/")[-2] + "/catalog.json"
        ret.append({"rel": "child", "href": key})
    return sorted(ret, key=itemgetter("href"))
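
A unit-test style sketch using the 'response' parameter to avoid a real S3 call; the bucket name and prefixes are placeholders.

fake_response = {
    "CommonPrefixes": [
        {"Prefix": "CBERS4/MUX/071/"},
        {"Prefix": "CBERS4/MUX/072/"},
    ],
}
links = get_catalogs_from_s3("cbers-stac", "CBERS4/MUX/", response=fake_response)
# links == [{"rel": "child", "href": "071/catalog.json"},
#           {"rel": "child", "href": "072/catalog.json"}]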
Example 10
def sqs_messages(queue):
    """
    Generator for SQS messages.

    Input:
    queue(string): SQS URL.

    Output:
    dict with the following keys:
      stac_item: Message body
      ReceiptHandle: Message receipt handle
    """

    while True:
        response = get_client("sqs").receive_message(QueueUrl=queue)
        if "Messages" not in response:
            break
        retd = dict()
        retd["stac_item"] = response["Messages"][0]["Body"]
        retd["ReceiptHandle"] = response["Messages"][0]["ReceiptHandle"]
        yield retd
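
A usage sketch; the queue URL is a placeholder. The generator stops once the queue returns no more messages.

for msg in sqs_messages("https://sqs.us-east-1.amazonaws.com/123456789012/quicklooks"):
    print(msg["stac_item"], msg["ReceiptHandle"])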
Example 11
def get_items_from_s3(bucket, prefix, response=None):
    """
    Return a list of item links (.json files) located under the S3
    prefix.

    Input:
    bucket(string): bucket name
    prefix(string): key prefix
    response(dict): S3 output from list_objects_v2, used for unit testing
    """
    ret = list()
    if not response:
        response = get_client("s3").list_objects_v2(Bucket=bucket,
                                                    Delimiter="/",
                                                    Prefix=prefix)
        assert not response["IsTruncated"], "Truncated S3 listing"
    for item in response["Contents"]:
        key = item["Key"].split("/")[-1]
        # Keep only item files matching L\d{1}.json, skipping catalog.json
        if re.match(r".*L\d{1}.json", key):
            ret.append({"rel": "item", "href": key})
    return sorted(ret, key=itemgetter("href"))
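
A unit-test style sketch mirroring the one for get_catalogs_from_s3; the keys below are placeholders.

fake_response = {
    "Contents": [
        {"Key": "CBERS4/MUX/071/100/CBERS_4_MUX_20200101_071_100_L2.json"},
        {"Key": "CBERS4/MUX/071/100/catalog.json"},
    ],
}
items = get_items_from_s3("cbers-stac", "CBERS4/MUX/071/100/", response=fake_response)
# items == [{"rel": "item", "href": "CBERS_4_MUX_20200101_071_100_L2.json"}]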
Example 12
def send_stac_items_to_queue(bucket: str, queue: str, prefix: str) -> None:
    """send_stac_items_to_queue.

    Args:
      filter_params are passed directly to list_objects_v2 paginator.
    """
    sqs_queue = get_resource("sqs").Queue(queue)
    s3_client = get_client("s3")
    paginator = s3_client.get_paginator("list_objects_v2")
    for result in paginator.paginate(Bucket=bucket,
                                     PaginationConfig={"PageSize": 1000},
                                     Prefix=prefix):
        assert result["KeyCount"] <= 1000
        entries = [
            {
                "Id": str(index),
                "MessageBody": json.dumps({
                    "Message": json.dumps(
                        json.loads(
                            s3_client.get_object(
                                Bucket=bucket, Key=obj["Key"]
                            )["Body"].read().decode("utf-8")
                        )
                    )
                }),
            }
            for index, obj in enumerate(result["Contents"])
            if obj["Key"].split("/")[-1] not in ["catalog.json", "collection.json"]
        ]
        # Send items to queue at max 10 at a time
        chunk_size = 10
        chunked_entries = [
            entries[i * chunk_size:(i + 1) * chunk_size]
            for i in range((len(entries) + chunk_size - 1) // chunk_size)
        ]
        for chunk in chunked_entries:
            response = sqs_queue.send_messages(Entries=chunk)
            assert len(response["Successful"]) == len(chunk)
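
A usage sketch; the bucket name, queue URL and prefix are placeholders.

send_stac_items_to_queue(
    bucket="cbers-stac",                                                      # assumed
    queue="https://sqs.us-east-1.amazonaws.com/123456789012/insert-into-es",  # assumed
    prefix="CBERS4/MUX/071/100/",                                             # assumed
)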
Example 13
def process_message(
    msg, buckets, sns_target_arn, catalog_update_queue, catalog_update_table
):
    """
    Process a single message. Generate STAC item, send STAC item to SNS topic,
    write key into DynamoDB table and, optionally, send key to queue for
    further processing.

    Input:
      msg(dict): message (quicklook) to be processed; the quicklook S3 key is in 'key'
      buckets(dict): buckets for 'cog', 'stac' and 'metadata'
      sns_target_arn(string): SNS arn for stac items. Items are always published
      catalog_update_queue(string): URL of queue that receives new STAC items
        for updating the catalog structure, None if not used.
      catalog_update_table: DynamoDB table that holds the catalog update requests
    """

    LOGGER.info(msg["key"])
    metadata_keys = get_s3_keys(msg["key"])

    assert metadata_keys["quicklook_keys"]["camera"] in (
        "MUX",
        "AWFI",
        "PAN10M",
        "PAN5M",
        "WPM",
        "WFI",
    ), ("Unrecognized key: " + metadata_keys["quicklook_keys"]["camera"])

    local_inpe_metadata = "/tmp/" + metadata_keys["inpe_metadata"].split("/")[-1]
    local_stac_item = "/tmp/" + metadata_keys["stac"].split("/")[-1]
    # Download INPE metadata and generate STAC item file
    with open(local_inpe_metadata, "wb") as data:
        get_client("s3").download_fileobj(
            buckets["cog"],
            metadata_keys["inpe_metadata"],
            data,
            ExtraArgs={"RequestPayer": "requester"},
        )
    stac_meta = convert_inpe_to_stac(
        inpe_metadata_filename=local_inpe_metadata,
        stac_metadata_filename=local_stac_item,
        buckets=buckets,
    )
    # Upload STAC item file
    with open(local_stac_item, "rb") as data:
        get_client("s3").upload_fileobj(data, buckets["stac"], metadata_keys["stac"])

    # Publish to SNS topic
    get_client("sns").publish(
        TargetArn=sns_target_arn,
        Message=json.dumps(stac_meta),
        MessageAttributes=build_sns_topic_msg_attributes(stac_meta),
    )

    # Send message to update catalog tree queue
    if catalog_update_queue:
        get_client("sqs").send_message(
            QueueUrl=catalog_update_queue, MessageBody=metadata_keys["stac"]
        )

    # Request catalog update
    catalog_update_request(
        table_name=catalog_update_table, stac_item_key=metadata_keys["stac"]
    )
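
A usage sketch; every name below is a placeholder, and get_s3_keys() must recognize the quicklook key passed in 'key'.

process_message(
    msg={"key": "CBERS4/MUX/071/100/CBERS_4_MUX_20200101_071_100/quicklook.png"},     # assumed
    buckets={"cog": "cbers-pds", "stac": "cbers-stac", "metadata": "cbers-meta-pds"},  # assumed
    sns_target_arn="arn:aws:sns:us-east-1:123456789012:stac-items",                   # assumed
    catalog_update_queue=None,                   # skip the catalog tree update queue
    catalog_update_table="CatalogUpdateTable",   # assumed
)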