Example #1
def do_index(
        s3_client,
        doc_queue: DocumentQueue,
        event_type: str,
        *,
        bucket: str,
        etag: str,
        ext: str,
        key: str,
        last_modified: str,
        text: str = '',
        size: int = 0,
        version_id: Optional[str] = None,
):
    """wrap dual indexing of packages and objects"""
    logger_ = get_quilt_logger()
    # index as object (always)
    logger_.debug("%s to indexing queue (%s)", key, event_type)
    doc_queue.append(
        event_type,
        DocTypes.OBJECT,
        bucket=bucket,
        ext=ext,
        etag=etag,
        key=key,
        last_modified=last_modified,
        size=size,
        text=text,
        version_id=version_id
    )
    # maybe index as package
    if index_if_package(
        s3_client,
        doc_queue,
        event_type,
        bucket=bucket,
        etag=etag,
        ext=ext,
        key=key,
        last_modified=last_modified,
        size=size,
        version_id=version_id,
    ):
        logger_.debug("%s indexed as package (%s)", key, event_type)
Example #2
def handler(event, context):
    """enumerate S3 keys in event, extract relevant data, queue events, send to
    elastic via bulk() API
    """
    logger_ = get_quilt_logger()
    # message is a proper SQS message, which either contains a single event
    # (from the bucket notification system) or batch-many events as determined
    # by enterprise/**/bulk_loader.py
    # An exception that we'll want to re-raise after the batch sends
    content_exception = None
    for message in event["Records"]:
        body = json.loads(message["body"])
        body_message = json.loads(body["Message"])
        if "Records" not in body_message:
            if body_message.get("Event") == TEST_EVENT:
                logger_.debug("Skipping S3 Test Event")
                # Consume and ignore this event, which is an initial message from
                # SQS; see https://forums.aws.amazon.com/thread.jspa?threadID=84331
                continue
            print("Unexpected message['body']. No 'Records' key.", message)
            raise Exception("Unexpected message['body']. No 'Records' key.")
        batch_processor = DocumentQueue(context)
        events = body_message.get("Records", [])
        s3_client = make_s3_client()
        # event is a single S3 event
        for event_ in events:
            logger_.debug("Processing %s", event_)
            try:
                event_name = event_["eventName"]
                # Process all Create:* and Remove:* events
                if not any(
                        event_name.startswith(n)
                        for n in EVENT_PREFIX.values()):
                    continue
                bucket = unquote(event_["s3"]["bucket"]["name"])
                # In the grand tradition of IE6, S3 events turn spaces into '+'
                key = unquote_plus(event_["s3"]["object"]["key"])
                version_id = event_["s3"]["object"].get("versionId")
                version_id = unquote(version_id) if version_id else None
                # Skip delete markers when versioning is on
                if version_id and event_name == "ObjectRemoved:DeleteMarkerCreated":
                    continue
                # ObjectRemoved:Delete does not include "eTag"
                etag = unquote(event_["s3"]["object"].get("eTag", ""))
                # Get two levels of extensions to handle files like .csv.gz
                path = pathlib.PurePosixPath(key)
                ext1 = path.suffix
                ext2 = path.with_suffix('').suffix
                ext = (ext2 + ext1).lower()

                # Handle delete first and then continue so that
                # head_object and get_object (below) don't fail
                if event_name.startswith(EVENT_PREFIX["Removed"]):
                    logger_.debug("Object delete to queue")
                    batch_processor.append(event_name,
                                           DocTypes.OBJECT,
                                           bucket=bucket,
                                           ext=ext,
                                           etag=etag,
                                           key=key,
                                           last_modified=now_like_boto3(),
                                           text="",
                                           version_id=version_id)
                    continue

                try:
                    logger_.debug("Get object head")
                    head = retry_s3("head",
                                    bucket,
                                    key,
                                    s3_client=s3_client,
                                    version_id=version_id,
                                    etag=etag)
                except botocore.exceptions.ClientError as first:
                    logger_.warning("head_object error: %s", first)
                    # "null" version sometimes results in 403s for buckets
                    # that have changed versioning, retry without it
                    if (first.response.get('Error', {}).get('Code') == "403"
                            and version_id == "null"):
                        try:
                            head = retry_s3("head",
                                            bucket,
                                            key,
                                            s3_client=s3_client,
                                            version_id=None,
                                            etag=etag)
                        except botocore.exceptions.ClientError as second:
                            # this will bypass the DLQ but that's the right thing to do
                            # as some listed objects may NEVER succeed head requests
                            # (e.g. foreign owner) and there's no reason to torpedo
                            # the whole batch (which might include good files)
                            logger_.warning("Retried head_object error: %s",
                                            second)

                    logger_.error("Fatal head_object, skipping event: %s",
                                  event_)
                    continue

                size = head["ContentLength"]
                last_modified = head["LastModified"]

                did_index = index_if_manifest(s3_client,
                                              batch_processor,
                                              event_name,
                                              bucket=bucket,
                                              etag=etag,
                                              ext=ext,
                                              key=key,
                                              last_modified=last_modified,
                                              size=size,
                                              version_id=version_id)
                logger_.debug("Logged as manifest? %s", did_index)

                try:
                    text = maybe_get_contents(bucket,
                                              key,
                                              ext,
                                              etag=etag,
                                              version_id=version_id,
                                              s3_client=s3_client,
                                              size=size)
                # we still want an entry for this document in elastic so that, e.g.,
                # the file counts from elastic are correct. re-raise below.
                except Exception as exc:  # pylint: disable=broad-except
                    text = ""
                    content_exception = exc
                    logger_.error("Content extraction failed %s %s %s", bucket,
                                  key, exc)

                batch_processor.append(event_name,
                                       DocTypes.OBJECT,
                                       bucket=bucket,
                                       key=key,
                                       ext=ext,
                                       etag=etag,
                                       version_id=version_id,
                                       last_modified=last_modified,
                                       size=size,
                                       text=text)

            except botocore.exceptions.ClientError as boto_exc:
                if not should_retry_exception(boto_exc):
                    logger_.warning("Got exception but retrying: %s", boto_exc)
                    continue
                logger_.critical("Failed record: %s, %s", event, boto_exc)
                raise boto_exc
        # flush the queue
        batch_processor.send_all()
        # note: if there are multiple content exceptions in the batch, this will
        # only raise the most recent one;
        # re-raise so that get_contents() failures end up in the DLQ
        if content_exception:
            logger_.critical("Failed batch due to %s", content_exception)
            raise content_exception
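
A sketch of the event shape this handler expects, inferred from the two json.loads() calls above: each SQS record's "body" is a JSON string whose "Message" field is itself a JSON string holding the S3 "Records" list. All names and values are hypothetical.

import json

s3_records = {
    "Records": [{
        "eventName": "ObjectCreated:Put",
        "s3": {
            "bucket": {"name": "example-bucket"},      # hypothetical
            "object": {
                "key": "data/report.csv.gz",           # hypothetical
                "eTag": "0123456789abcdef",
                "versionId": "null",
            },
        },
    }]
}

event = {
    "Records": [{
        "body": json.dumps({"Message": json.dumps(s3_records)}),
    }]
}

# handler(event, context) unwraps both layers before queueing documents.
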
Example #3
def index_if_manifest(s3_client, doc_queue: DocumentQueue, event_type: str, *,
                      bucket: str, etag: str, ext: str, key: str,
                      last_modified: str, version_id: Optional[str],
                      size: int) -> bool:
    """index manifest files as package documents in ES
        Returns:
            - True if manifest (and passes to doc_queue for indexing)
            - False if not a manifest (no attempt at indexing)
    """
    logger_ = get_quilt_logger()
    pointer_prefix, pointer_file = split(key)
    handle = pointer_prefix[len(POINTER_PREFIX_V1):]
    if (not pointer_prefix.startswith(POINTER_PREFIX_V1) or len(handle) < 3
            or '/' not in handle):
        logger_.debug("Not indexing as manifest file s3://%s/%s", bucket, key)
        return False
    try:
        manifest_timestamp = int(pointer_file)
    except ValueError as err:
        logger_.debug("Non-integer manifest pointer: s3://%s/%s, %s", bucket,
                      key, err)
        # this is probably the latest pointer, skip it. manifest already indexed.
        return False
    else:
        if not 1451631600 <= manifest_timestamp <= 1767250800:
            logger_.warning("Unexpected manifest timestamp s3://%s/%s", bucket,
                            key)
            return False

    package_hash = get_plain_text(
        bucket,
        key,
        size,
        None,
        etag=etag,
        s3_client=s3_client,
        version_id=version_id,
    ).strip()

    manifest_key = f"{MANIFEST_PREFIX_V1}{package_hash}"
    first = select_manifest_meta(s3_client, bucket, manifest_key)
    stats = select_package_stats(s3_client, bucket, manifest_key)
    if not first:
        logger_.error("S3 select failed %s %s", bucket, manifest_key)
        return False
    try:
        first_dict = json.loads(first)
        doc_queue.append(
            event_type,
            DocTypes.PACKAGE,
            bucket=bucket,
            etag=etag,
            ext=ext,
            handle=handle,
            key=manifest_key,
            last_modified=last_modified,
            package_hash=package_hash,
            package_stats=stats,
            pointer_file=pointer_file,
            comment=str(first_dict.get("message", "")),
            metadata=json.dumps(first_dict.get("user_meta", {})),
        )
        return True
    except (json.JSONDecodeError, botocore.exceptions.ClientError) as exc:
        print(f"{exc}\n"
              f"\tFailed to select first line of manifest s3://{bucket}/{key}."
              f"\tGot {first}.")
        return False
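
A standalone sketch of the pointer-key parsing in index_if_manifest, using os.path.split and an assumed value for POINTER_PREFIX_V1 (the real constant lives in the indexer module); the key below is hypothetical.

from os.path import split

POINTER_PREFIX_V1 = ".quilt/named_packages/"   # assumption for illustration

key = ".quilt/named_packages/alice/example-pkg/1451631600"   # hypothetical
pointer_prefix, pointer_file = split(key)
handle = pointer_prefix[len(POINTER_PREFIX_V1):]

print(pointer_prefix)  # .quilt/named_packages/alice/example-pkg
print(pointer_file)    # 1451631600 (an integer pointer, so it passes the int() check)
print(handle)          # alice/example-pkg ('/' in handle and len(handle) >= 3 both hold)
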
Example #4
def handler(event, context):
    """enumerate S3 keys in event, extract relevant data and metadata,
    queue events, send to elastic via bulk() API
    """
    # message is a proper SQS message, which either contains a single event
    # (from the bucket notification system) or batch-many events as determined
    # by enterprise/**/bulk_loader.py
    # An exception that we'll want to re-raise after the batch sends
    content_exception = None
    for message in event["Records"]:
        body = json.loads(message["body"])
        body_message = json.loads(body["Message"])
        if "Records" not in body_message:
            if body_message.get("Event") == TEST_EVENT:
                # Consume and ignore this event, which is an initial message from
                # SQS; see https://forums.aws.amazon.com/thread.jspa?threadID=84331
                continue
            else:
                print("Unexpected message['body']. No 'Records' key.", message)
                raise Exception("Unexpected message['body']. No 'Records' key.")
        batch_processor = DocumentQueue(context)
        events = body_message.get("Records", [])
        s3_client = make_s3_client()
        # event is a single S3 event
        for event_ in events:
            try:
                event_name = event_["eventName"]
                # only process these two event types
                if event_name not in [OBJECT_DELETE, OBJECT_PUT]:
                    continue
                bucket = unquote(event_["s3"]["bucket"]["name"])
                # In the grand tradition of IE6, S3 events turn spaces into '+'
                key = unquote_plus(event_["s3"]["object"]["key"])
                version_id = event_["s3"]["object"].get("versionId")
                version_id = unquote(version_id) if version_id else None
                etag = unquote(event_["s3"]["object"]["eTag"])

                # Get two levels of extensions to handle files like .csv.gz
                path = pathlib.PurePosixPath(key)
                ext1 = path.suffix
                ext2 = path.with_suffix('').suffix
                ext = (ext2 + ext1).lower()

                # Handle delete first and then continue so that
                # head_object and get_object (below) don't fail
                if event_name == OBJECT_DELETE:
                    batch_processor.append(
                        event_name,
                        bucket=bucket,
                        ext=ext,
                        etag=etag,
                        key=key,
                        last_modified=now_like_boto3(),
                        text="",
                        version_id=version_id
                    )
                    continue

                try:
                    head = retry_s3(
                        "head",
                        bucket,
                        key,
                        s3_client=s3_client,
                        version_id=version_id,
                        etag=etag
                    )
                except botocore.exceptions.ClientError as exception:
                    # "null" version sometimes results in 403s for buckets
                    # that have changed versioning, retry without it
                    if (exception.response.get('Error', {}).get('Code') == "403"
                            and version_id == "null"):
                        head = retry_s3(
                            "head",
                            bucket,
                            key,
                            s3_client=s3_client,
                            version_id=None,
                            etag=etag
                        )
                    else:
                        raise exception

                size = head["ContentLength"]
                last_modified = head["LastModified"]
                meta = head["Metadata"]

                try:
                    text = get_contents(
                        bucket,
                        key,
                        ext,
                        etag=etag,
                        version_id=version_id,
                        s3_client=s3_client,
                        size=size
                    )
                # we still want an entry for this document in elastic so that, e.g.,
                # the file counts from elastic are correct. re-raise below.
                except Exception as exc:  # pylint: disable=broad-except
                    text = ""
                    content_exception = exc
                    print("Content extraction failed", exc, bucket, key, etag, version_id)

                # decode Quilt-specific metadata
                if meta and "helium" in meta:
                    try:
                        decoded_helium = json.loads(meta["helium"])
                        meta["helium"] = decoded_helium or {}
                    except (KeyError, json.JSONDecodeError):
                        print("Unable to parse Quilt 'helium' metadata", meta)

                batch_processor.append(
                    event_name,
                    bucket=bucket,
                    key=key,
                    ext=ext,
                    meta=meta,
                    etag=etag,
                    version_id=version_id,
                    last_modified=last_modified,
                    size=size,
                    text=text
                )
            except botocore.exceptions.ClientError as boto_exc:
                if not should_retry_exception(boto_exc):
                    continue
                else:
                    print("Fatal exception for record", event_, boto_exc)
                    import traceback
                    traceback.print_tb(boto_exc.__traceback__)
                    raise boto_exc
        # flush the queue
        batch_processor.send_all()
        # note: if there are multiple content exceptions in the batch, this will
        # only raise the most recent one;
        # re-raise so that get_contents() failures end up in the DLQ
        if content_exception:
            raise content_exception
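
The two-level extension extraction used above, shown standalone on a few sample keys so files like .csv.gz keep both suffixes.

import pathlib

for key in ["archive/table.csv.gz", "notes.txt", "model.tar.gz", "README"]:
    path = pathlib.PurePosixPath(key)
    ext1 = path.suffix                  # last extension, e.g. ".gz"
    ext2 = path.with_suffix('').suffix  # extension before it, e.g. ".csv"
    ext = (ext2 + ext1).lower()
    print(f"{key!r} -> {ext!r}")
# 'archive/table.csv.gz' -> '.csv.gz'
# 'notes.txt' -> '.txt'
# 'model.tar.gz' -> '.tar.gz'
# 'README' -> ''
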
Example #5
def handler(event, context):
    """enumerate S3 keys in event, extract relevant data, queue events, send to
    elastic via bulk() API
    """
    logger_ = get_quilt_logger()
    # message is a proper SQS message, which either contains a single event
    # (from the bucket notification system) or batch-many events as determined
    # by enterprise/**/bulk_loader.py
    # An exception that we'll want to re-raise after the batch sends
    content_exception = None
    batch_processor = DocumentQueue(context)
    s3_client = make_s3_client()
    for message in event["Records"]:
        body = json.loads(message["body"])
        body_message = json.loads(body["Message"])
        if "Records" not in body_message:
            # could be TEST_EVENT, or another unexpected event; skip it
            logger_.error("No 'Records' key in message['body']: %s", message)
            continue
        events = body_message["Records"]
        # event is a single S3 event
        for event_ in events:
            validated = shape_event(event_)
            if not validated:
                logger_.debug("Skipping invalid event %s", event_)
                continue
            event_ = validated
            logger_.debug("Processing %s", event_)
            try:
                event_name = event_["eventName"]
                # Process all Create:* and Remove:* events
                if not any(event_name.startswith(n) for n in EVENT_PREFIX.values()):
                    logger_.warning("Skipping unknown event type: %s", event_name)
                    continue
                bucket = event_["s3"]["bucket"]["name"]
                # In the grand tradition of IE6, S3 events turn spaces into '+'
                # TODO: check if eventbridge events do the same thing with +
                key = unquote_plus(event_["s3"]["object"]["key"])
                version_id = event_["s3"]["object"].get("versionId", None)
                # ObjectRemoved:Delete does not include "eTag"
                etag = event_["s3"]["object"].get("eTag", "")
                # synthetic events from bulk scanner might define lastModified
                last_modified = (
                    event_["s3"]["object"].get("lastModified") or event_["eventTime"]
                )
                # Get two levels of extensions to handle files like .csv.gz
                path = pathlib.PurePosixPath(key)
                ext1 = path.suffix
                ext2 = path.with_suffix('').suffix
                ext = (ext2 + ext1).lower()
                # Handle delete and deletemarker first and then continue so that
                # head_object and get_object (below) don't fail
                if event_name.startswith(EVENT_PREFIX["Removed"]):
                    do_index(
                        s3_client,
                        batch_processor,
                        event_name,
                        bucket=bucket,
                        etag=etag,
                        ext=ext,
                        key=key,
                        last_modified=last_modified,
                        version_id=version_id
                    )
                    continue
                try:
                    head = retry_s3(
                        "head",
                        bucket,
                        key,
                        s3_client=s3_client,
                        version_id=version_id,
                        etag=etag
                    )
                except botocore.exceptions.ClientError as first:
                    logger_.warning("head_object error: %s", first)
                    # "null" version sometimes results in 403s for buckets
                    # that have changed versioning, retry without it
                    if (first.response.get('Error', {}).get('Code') == "403"
                            and version_id == "null"):
                        try:
                            head = retry_s3(
                                "head",
                                bucket,
                                key,
                                s3_client=s3_client,
                                version_id=None,
                                etag=etag
                            )
                        except botocore.exceptions.ClientError as second:
                            # this will bypass the DLQ but that's the right thing to do
                            # as some listed objects may NEVER succeed head requests
                            # (e.g. foreign owner) and there's no reason to torpedo
                            # the whole batch (which might include good files)
                            logger_.warning("Retried head_object error: %s", second)
                    logger_.error("Fatal head_object, skipping event: %s", event_)
                    continue
                # backfill fields based on the head_object
                size = head["ContentLength"]
                last_modified = last_modified or head["LastModified"].isoformat()
                etag = head.get("etag") or etag
                version_id = head.get("VersionId") or version_id
                try:
                    text = maybe_get_contents(
                        bucket,
                        key,
                        ext,
                        etag=etag,
                        version_id=version_id,
                        s3_client=s3_client,
                        size=size
                    )
                # we still want an entry for this document in elastic so that, e.g.,
                # the file counts from elastic are correct
                # these exceptions can happen for a variety of reasons (e.g. glacier
                # storage class, index event arrives after delete has occurred, etc.)
                # given how common they are, we shouldn't fail the batch for this
                except Exception as exc:  # pylint: disable=broad-except
                    text = ""
                    logger_.warning("Content extraction failed %s %s %s", bucket, key, exc)

                do_index(
                    s3_client,
                    batch_processor,
                    event_name,
                    bucket=bucket,
                    etag=etag,
                    ext=ext,
                    key=key,
                    last_modified=last_modified,
                    size=size,
                    text=text,
                    version_id=version_id
                )

            except botocore.exceptions.ClientError as boto_exc:
                if not should_retry_exception(boto_exc):
                    logger_.warning("Skipping non-fatal exception: %s", boto_exc)
                    continue
                logger_.critical("Failed record: %s, %s", event, boto_exc)
                raise boto_exc
    # flush the queue
    batch_processor.send_all()
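
A sketch of the Created/Removed prefix filter above, with an assumed value for EVENT_PREFIX (the real mapping is defined alongside the indexer); the event names are standard S3 notification types.

EVENT_PREFIX = {"Created": "ObjectCreated:", "Removed": "ObjectRemoved:"}  # assumption

for event_name in [
    "ObjectCreated:Put",
    "ObjectCreated:Copy",
    "ObjectRemoved:Delete",
    "ObjectRemoved:DeleteMarkerCreated",
    "ObjectRestore:Completed",
]:
    handled = any(event_name.startswith(n) for n in EVENT_PREFIX.values())
    print(event_name, "->", "handled" if handled else "skipped")
# Only ObjectRestore:Completed is skipped in this list.
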
Example #6
def index_if_package(
        s3_client,
        doc_queue: DocumentQueue,
        *,
        bucket: str,
        etag: str,
        key: str,
        last_modified: str,
        version_id: Optional[str],
) -> bool:
    """index manifest pointer files as package documents in ES
        Returns:
            - True if pointer to manifest (and passes to doc_queue for indexing)
            - False if not a manifest (no attempt at indexing)
    """
    logger_ = get_quilt_logger()
    pointer_prefix, pointer_file = split(key)
    handle = pointer_prefix[len(POINTER_PREFIX_V1):]
    if (
            not pointer_file
            or not pointer_prefix.startswith(POINTER_PREFIX_V1)
            or len(handle) < 3
            or '/' not in handle
    ):
        logger_.debug("Not indexing as manifest file s3://%s/%s", bucket, key)
        return False
    try:
        manifest_timestamp = int(pointer_file)
        if not 1451631600 <= manifest_timestamp <= 1767250800:
            logger_.warning("Unexpected manifest timestamp s3://%s/%s", bucket, key)
            return False
    except ValueError as err:
        logger_.debug("Non-integer manifest pointer: s3://%s/%s, %s", bucket, key, err)

    def get_pkg_data():
        try:
            package_hash = s3_client.get_object(
                Bucket=bucket,
                Key=key,
            )['Body'].read().decode()
        except botocore.exceptions.ClientError:
            return

        manifest_key = f'{MANIFEST_PREFIX_V1}{package_hash}'
        first = select_manifest_meta(s3_client, bucket, manifest_key)
        if not first:
            return
        stats = select_package_stats(s3_client, bucket, manifest_key)
        if not stats:
            return

        return {
            "key": key,
            "etag": etag,
            "version_id": version_id,
            "last_modified": last_modified,
            "delete_marker": False,  # TODO: remove
            "handle": handle,
            "pointer_file": pointer_file,
            "hash": package_hash,
            "package_stats": stats,
            "metadata": json.dumps(first.get("user_meta", {})),
            "comment": str(first.get("message", "")),
        }

    data = get_pkg_data() or {}
    doc_queue.append_document({
        "_index": bucket + PACKAGE_INDEX_SUFFIX,
        "_id": key,
        "_op_type": "index" if data else "delete",
        **data,
    })

    return True
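
How the document emitted by this version of index_if_package toggles between indexing and deletion, shown standalone; PACKAGE_INDEX_SUFFIX is an assumption standing in for the real constant.

PACKAGE_INDEX_SUFFIX = "_packages"   # assumption for illustration

def make_package_doc(bucket, key, data):
    # With package data present we (re)index the document; with an empty dict
    # (get_pkg_data() returned None) we emit a delete action for the same _id.
    return {
        "_index": bucket + PACKAGE_INDEX_SUFFIX,
        "_id": key,
        "_op_type": "index" if data else "delete",
        **data,
    }

doc = make_package_doc("example-bucket", ".quilt/named_packages/alice/pkg/latest", {})
print(doc["_op_type"])   # delete, because no package data could be fetched
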
Example #7
def index_if_package(
        s3_client,
        doc_queue: DocumentQueue,
        event_type: str,
        *,
        bucket: str,
        etag: str,
        ext: str,
        key: str,
        last_modified: str,
        version_id: Optional[str],
        size: int
) -> bool:
    """index manifest pointer files as package documents in ES
        Returns:
            - True if pointer to manifest (and passes to doc_queue for indexing)
            - False if not a manifest (no attempt at indexing)
    """
    logger_ = get_quilt_logger()
    pointer_prefix, pointer_file = split(key)
    handle = pointer_prefix[len(POINTER_PREFIX_V1):]
    if (
            not pointer_file
            or not pointer_prefix.startswith(POINTER_PREFIX_V1)
            or len(handle) < 3
            or '/' not in handle
    ):
        logger_.debug("Not indexing as manifest file s3://%s/%s", bucket, key)
        return False
    try:
        manifest_timestamp = int(pointer_file)
        is_tag = False
        if not 1451631600 <= manifest_timestamp <= 1767250800:
            logger_.warning("Unexpected manifest timestamp s3://%s/%s", bucket, key)
            return False
    except ValueError as err:
        is_tag = True
        logger_.debug("Non-integer manifest pointer: s3://%s/%s, %s", bucket, key, err)

    package_hash = ''
    first_dict = {}
    stats = None
    # we only need to get manifest contents for proper create events (not latest pointers)
    if event_type.startswith(EVENT_PREFIX["Created"]) and not is_tag:
        package_hash = get_plain_text(
            bucket,
            key,
            size,
            None,
            etag=etag,
            s3_client=s3_client,
            version_id=version_id,
        ).strip()
        manifest_key = f'{MANIFEST_PREFIX_V1}{package_hash}'
        first = select_manifest_meta(s3_client, bucket, manifest_key)
        stats = select_package_stats(s3_client, bucket, manifest_key)
        if not first:
            logger_.error("S3 select failed %s %s", bucket, manifest_key)
            return False
        try:
            first_dict = json.loads(first)
        except (json.JSONDecodeError, botocore.exceptions.ClientError) as exc:
            print(
                f"{exc}\n"
                f"\tFailed to select first line of manifest s3://{bucket}/{key}."
                f"\tGot {first}."
            )
            return False

    doc_queue.append(
        event_type,
        DocTypes.PACKAGE,
        bucket=bucket,
        etag=etag,
        ext=ext,
        handle=handle,
        key=key,
        last_modified=last_modified,
        # if we don't have the hash, we're processing a tag
        package_hash=(package_hash or pointer_file),
        package_stats=stats,
        pointer_file=pointer_file,
        comment=str(first_dict.get("message", "")),
        metadata=json.dumps(first_dict.get("user_meta", {})),
        version_id=version_id,
    )

    return True
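
The pointer-timestamp sanity bounds used above, decoded standalone: they bracket roughly the start of 2016 through the start of 2026, so anything outside that window is treated as suspect.

from datetime import datetime, timezone

for bound in (1451631600, 1767250800):
    print(bound, datetime.fromtimestamp(bound, tz=timezone.utc).isoformat())
# 1451631600 2016-01-01T07:00:00+00:00
# 1767250800 2026-01-01T07:00:00+00:00
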
Example #8
def handler(event, context):
    """enumerate S3 keys in event, extract relevant data and metadata,
    queue events, send to elastic via bulk() API
    """
    # message is a proper SQS message, which either contains a single event
    # (from the bucket notification system) or batch-many events as determined
    # by enterprise/**/bulk_loader.py
    for message in event["Records"]:
        body = json.loads(message["body"])
        body_message = json.loads(body["Message"])
        if "Records" not in body_message:
            if body_message.get("Event") == TEST_EVENT:
                # Consume and ignore this event, which is an initial message from
                # SQS; see https://forums.aws.amazon.com/thread.jspa?threadID=84331
                continue
            else:
                print("Unexpected message['body']. No 'Records' key.", message)
                raise Exception(
                    "Unexpected message['body']. No 'Records' key.")
        batch_processor = DocumentQueue(context)
        events = body_message.get("Records", [])
        s3_client = make_s3_client()
        # event is a single S3 event
        for event_ in events:
            try:
                event_name = event_["eventName"]
                bucket = unquote(event_["s3"]["bucket"]["name"])
                # In the grand tradition of IE6, S3 events turn spaces into '+'
                key = unquote_plus(event_["s3"]["object"]["key"])
                version_id = event_["s3"]["object"].get("versionId")
                version_id = unquote(version_id) if version_id else None
                etag = unquote(event_["s3"]["object"]["eTag"])

                ext = pathlib.PurePosixPath(key).suffix.lower()

                try:
                    head = retry_s3("head",
                                    bucket,
                                    key,
                                    s3_client=s3_client,
                                    version_id=version_id,
                                    etag=etag)
                except botocore.exceptions.ClientError as exception:
                    # "null" version sometimes results in 403s for buckets
                    # that have changed versioning, retry without it
                    if (exception.response.get('Error', {}).get('Code')
                            == "403" and version_id == "null"):
                        head = retry_s3("head",
                                        bucket,
                                        key,
                                        s3_client=s3_client,
                                        version_id=None,
                                        etag=etag)
                    else:
                        raise exception

                size = head["ContentLength"]
                last_modified = head["LastModified"]
                meta = head["Metadata"]
                text = ""

                if event_name == OBJECT_DELETE:
                    batch_processor.append(event_name,
                                           bucket=bucket,
                                           ext=ext,
                                           etag=etag,
                                           key=key,
                                           last_modified=last_modified,
                                           text=text,
                                           version_id=version_id)
                    continue

                text = get_contents(bucket,
                                    key,
                                    ext,
                                    etag=etag,
                                    version_id=version_id,
                                    s3_client=s3_client,
                                    size=size)
                # decode Quilt-specific metadata
                if meta and "helium" in meta:
                    try:
                        decoded_helium = json.loads(meta["helium"])
                        meta["helium"] = decoded_helium or {}
                    except (KeyError, json.JSONDecodeError):
                        print("Unable to parse Quilt 'helium' metadata", meta)

                batch_processor.append(event_name,
                                       bucket=bucket,
                                       key=key,
                                       ext=ext,
                                       meta=meta,
                                       etag=etag,
                                       version_id=version_id,
                                       last_modified=last_modified,
                                       size=size,
                                       text=text)
            except botocore.exceptions.ClientError as boto_exc:
                if not should_retry_exception(boto_exc):
                    continue
                else:
                    print("Fatal exception for record", event_, boto_exc)
                    import traceback
                    traceback.print_tb(boto_exc.__traceback__)
                    raise boto_exc
            except Exception as exc:  # pylint: disable=broad-except
                print("Fatal exception for record", event_, exc)
                import traceback
                traceback.print_tb(exc.__traceback__)
                raise exc
        # flush the queue
        batch_processor.send_all()
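
A standalone look at the Quilt 'helium' metadata decoding above: the S3 object metadata carries a JSON string that is parsed in place. The payload below is hypothetical.

import json

meta = {"helium": json.dumps({"format": "parquet", "user_meta": {"author": "alice"}})}

if meta and "helium" in meta:
    try:
        decoded_helium = json.loads(meta["helium"])
        meta["helium"] = decoded_helium or {}
    except (KeyError, json.JSONDecodeError):
        print("Unable to parse Quilt 'helium' metadata", meta)

print(meta["helium"]["user_meta"]["author"])   # alice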