def do_index(
    s3_client,
    doc_queue: DocumentQueue,
    event_type: str,
    *,
    bucket: str,
    etag: str,
    ext: str,
    key: str,
    last_modified: str,
    text: str = '',
    size: int = 0,
    version_id: Optional[str] = None,
):
    """wrap dual indexing of packages and objects"""
    logger_ = get_quilt_logger()
    # index as object (always)
    logger_.debug("%s to indexing queue (%s)", key, event_type)
    doc_queue.append(
        event_type,
        DocTypes.OBJECT,
        bucket=bucket,
        ext=ext,
        etag=etag,
        key=key,
        last_modified=last_modified,
        size=size,
        text=text,
        version_id=version_id
    )
    # maybe index as package
    if index_if_package(
        s3_client,
        doc_queue,
        event_type,
        bucket=bucket,
        etag=etag,
        ext=ext,
        key=key,
        last_modified=last_modified,
        size=size,
        version_id=version_id,
    ):
        logger_.debug("%s indexed as package (%s)", key, event_type)
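
# For illustration only: a minimal, hedged sketch of how do_index() might be
# invoked for a single object-created event. The bucket, key, etag, and text
# values are hypothetical; DocumentQueue, make_s3_client, and now_like_boto3
# are assumed to behave as they do elsewhere in this module.
def _example_do_index(context):
    s3_client = make_s3_client()
    queue = DocumentQueue(context)
    do_index(
        s3_client,
        queue,
        "ObjectCreated:Put",  # an EVENT_PREFIX["Created"] event name
        bucket="example-bucket",  # hypothetical
        etag="d41d8cd98f00b204e9800998ecf8427e",  # hypothetical
        ext=".csv.gz",
        key="data/example.csv.gz",  # hypothetical
        last_modified=now_like_boto3(),
        text="col1,col2",
        size=1024,
    )
    # flush queued documents to Elasticsearch via the bulk() API
    queue.send_all()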
def handler(event, context):
    """enumerate S3 keys in event, extract relevant data,
    queue events, send to elastic via bulk() API
    """
    logger_ = get_quilt_logger()
    # message is a proper SQS message, which either contains a single event
    # (from the bucket notification system) or batch-many events as determined
    # by enterprise/**/bulk_loader.py
    # An exception that we'll want to re-raise after the batch sends
    content_exception = None
    for message in event["Records"]:
        body = json.loads(message["body"])
        body_message = json.loads(body["Message"])
        if "Records" not in body_message:
            if body_message.get("Event") == TEST_EVENT:
                logger_.debug("Skipping S3 Test Event")
                # Consume and ignore this event, which is an initial message from
                # SQS; see https://forums.aws.amazon.com/thread.jspa?threadID=84331
                continue
            print("Unexpected message['body']. No 'Records' key.", message)
            raise Exception("Unexpected message['body']. No 'Records' key.")
        batch_processor = DocumentQueue(context)
        events = body_message.get("Records", [])
        s3_client = make_s3_client()
        # event is a single S3 event
        for event_ in events:
            logger_.debug("Processing %s", event_)
            try:
                event_name = event_["eventName"]
                # Process all Create:* and Remove:* events
                if not any(event_name.startswith(n) for n in EVENT_PREFIX.values()):
                    continue
                bucket = unquote(event_["s3"]["bucket"]["name"])
                # In the grand tradition of IE6, S3 events turn spaces into '+'
                key = unquote_plus(event_["s3"]["object"]["key"])
                version_id = event_["s3"]["object"].get("versionId")
                version_id = unquote(version_id) if version_id else None
                # Skip delete markers when versioning is on
                if version_id and event_name == "ObjectRemoved:DeleteMarkerCreated":
                    continue
                # ObjectRemoved:Delete does not include "eTag"
                etag = unquote(event_["s3"]["object"].get("eTag", ""))
                # Get two levels of extensions to handle files like .csv.gz
                path = pathlib.PurePosixPath(key)
                ext1 = path.suffix
                ext2 = path.with_suffix('').suffix
                ext = (ext2 + ext1).lower()
                # Handle delete first and then continue so that
                # head_object and get_object (below) don't fail
                if event_name.startswith(EVENT_PREFIX["Removed"]):
                    logger_.debug("Object delete to queue")
                    batch_processor.append(
                        event_name,
                        DocTypes.OBJECT,
                        bucket=bucket,
                        ext=ext,
                        etag=etag,
                        key=key,
                        last_modified=now_like_boto3(),
                        text="",
                        version_id=version_id
                    )
                    continue
                try:
                    logger_.debug("Get object head")
                    head = retry_s3(
                        "head",
                        bucket,
                        key,
                        s3_client=s3_client,
                        version_id=version_id,
                        etag=etag
                    )
                except botocore.exceptions.ClientError as first:
                    logger_.warning("head_object error: %s", first)
                    # "null" version sometimes results in 403s for buckets
                    # that have changed versioning, retry without it
                    if (first.response.get('Error', {}).get('Code') == "403"
                            and version_id == "null"):
                        try:
                            head = retry_s3(
                                "head",
                                bucket,
                                key,
                                s3_client=s3_client,
                                version_id=None,
                                etag=etag
                            )
                        except botocore.exceptions.ClientError as second:
                            # this will bypass the DLQ but that's the right thing to do
                            # as some listed objects may NEVER succeed head requests
                            # (e.g. foreign owner) and there's no reason to torpedo
                            # the whole batch (which might include good files)
                            logger_.warning("Retried head_object error: %s", second)
                            logger_.error("Fatal head_object, skipping event: %s", event_)
                            continue
                size = head["ContentLength"]
                last_modified = head["LastModified"]
                did_index = index_if_manifest(
                    s3_client,
                    batch_processor,
                    event_name,
                    bucket=bucket,
                    etag=etag,
                    ext=ext,
                    key=key,
                    last_modified=last_modified,
                    size=size,
                    version_id=version_id
                )
                logger_.debug("Logged as manifest? %s", did_index)
                try:
                    text = maybe_get_contents(
                        bucket,
                        key,
                        ext,
                        etag=etag,
                        version_id=version_id,
                        s3_client=s3_client,
                        size=size
                    )
                # we still want an entry for this document in elastic so that, e.g.,
                # the file counts from elastic are correct. re-raise below.
                except Exception as exc:  # pylint: disable=broad-except
                    text = ""
                    content_exception = exc
                    logger_.error("Content extraction failed %s %s %s", bucket, key, exc)
                batch_processor.append(
                    event_name,
                    DocTypes.OBJECT,
                    bucket=bucket,
                    key=key,
                    ext=ext,
                    etag=etag,
                    version_id=version_id,
                    last_modified=last_modified,
                    size=size,
                    text=text
                )
            except botocore.exceptions.ClientError as boto_exc:
                if not should_retry_exception(boto_exc):
                    logger_.warning("Skipping non-retryable exception: %s", boto_exc)
                    continue
                logger_.critical("Failed record: %s, %s", event, boto_exc)
                raise boto_exc
        # flush the queue
        batch_processor.send_all()
    # note: if there are multiple content exceptions in the batch, this will
    # only raise the most recent one;
    # re-raise so that get_contents() failures end up in the DLQ
    if content_exception:
        logger_.critical("Failed batch due to %s", content_exception)
        raise content_exception
def index_if_manifest(
        s3_client,
        doc_queue: DocumentQueue,
        event_type: str,
        *,
        bucket: str,
        etag: str,
        ext: str,
        key: str,
        last_modified: str,
        version_id: Optional[str],
        size: int
) -> bool:
    """index manifest files as package documents in ES

    Returns:
        - True if manifest (and passes to doc_queue for indexing)
        - False if not a manifest (no attempt at indexing)
    """
    logger_ = get_quilt_logger()
    pointer_prefix, pointer_file = split(key)
    handle = pointer_prefix[len(POINTER_PREFIX_V1):]
    if (not pointer_prefix.startswith(POINTER_PREFIX_V1)
            or len(handle) < 3
            or '/' not in handle):
        logger_.debug("Not indexing as manifest file s3://%s/%s", bucket, key)
        return False
    try:
        manifest_timestamp = int(pointer_file)
    except ValueError as err:
        logger_.debug("Non-integer manifest pointer: s3://%s/%s, %s", bucket, key, err)
        # this is probably the latest pointer, skip it. manifest already indexed.
        return False
    else:
        if not 1451631600 <= manifest_timestamp <= 1767250800:
            logger_.warning("Unexpected manifest timestamp s3://%s/%s", bucket, key)
            return False
    package_hash = get_plain_text(
        bucket,
        key,
        size,
        None,
        etag=etag,
        s3_client=s3_client,
        version_id=version_id,
    ).strip()
    manifest_key = f"{MANIFEST_PREFIX_V1}{package_hash}"
    first = select_manifest_meta(s3_client, bucket, manifest_key)
    stats = select_package_stats(s3_client, bucket, manifest_key)
    if not first:
        logger_.error("S3 select failed %s %s", bucket, manifest_key)
        return False
    try:
        first_dict = json.loads(first)
        doc_queue.append(
            event_type,
            DocTypes.PACKAGE,
            bucket=bucket,
            etag=etag,
            ext=ext,
            handle=handle,
            key=manifest_key,
            last_modified=last_modified,
            package_hash=package_hash,
            package_stats=stats,
            pointer_file=pointer_file,
            comment=str(first_dict.get("message", "")),
            metadata=json.dumps(first_dict.get("user_meta", {})),
        )
        return True
    except (json.JSONDecodeError, botocore.exceptions.ClientError) as exc:
        print(
            f"{exc}\n"
            f"\tFailed to select first line of manifest s3://{bucket}/{key}."
            f"\tGot {first}."
        )
        return False
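
# Worked example (hedged): how index_if_manifest() and index_if_package()
# decompose a pointer key. POINTER_PREFIX_V1 is assumed here to resemble
# ".quilt/named_packages/"; the real constant is defined elsewhere in this
# module.
#
#   key          = ".quilt/named_packages/user/pkg/1606940487"
#   split(key)  -> (".quilt/named_packages/user/pkg", "1606940487")
#   handle       = "user/pkg"      # pointer_prefix minus the prefix, hence
#                                  # the '/' and len(handle) >= 3 checks
#   pointer_file = "1606940487"    # epoch seconds; the accepted range
#                                  # 1451631600..1767250800 brackets roughly
#                                  # 2016-01-01..2026-01-01
#
# A non-integer pointer_file (e.g. "latest") is a tag/latest pointer rather
# than a timestamped revision, and is handled separately in each revision of
# the function above and below.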
def handler(event, context):
    """enumerate S3 keys in event, extract relevant data and metadata,
    queue events, send to elastic via bulk() API
    """
    # message is a proper SQS message, which either contains a single event
    # (from the bucket notification system) or batch-many events as determined
    # by enterprise/**/bulk_loader.py
    # An exception that we'll want to re-raise after the batch sends
    content_exception = None
    for message in event["Records"]:
        body = json.loads(message["body"])
        body_message = json.loads(body["Message"])
        if "Records" not in body_message:
            if body_message.get("Event") == TEST_EVENT:
                # Consume and ignore this event, which is an initial message from
                # SQS; see https://forums.aws.amazon.com/thread.jspa?threadID=84331
                continue
            else:
                print("Unexpected message['body']. No 'Records' key.", message)
                raise Exception("Unexpected message['body']. No 'Records' key.")
        batch_processor = DocumentQueue(context)
        events = body_message.get("Records", [])
        s3_client = make_s3_client()
        # event is a single S3 event
        for event_ in events:
            try:
                event_name = event_["eventName"]
                # only process these two event types
                if event_name not in [OBJECT_DELETE, OBJECT_PUT]:
                    continue
                bucket = unquote(event_["s3"]["bucket"]["name"])
                # In the grand tradition of IE6, S3 events turn spaces into '+'
                key = unquote_plus(event_["s3"]["object"]["key"])
                version_id = event_["s3"]["object"].get("versionId")
                version_id = unquote(version_id) if version_id else None
                etag = unquote(event_["s3"]["object"]["eTag"])
                # Get two levels of extensions to handle files like .csv.gz
                path = pathlib.PurePosixPath(key)
                ext1 = path.suffix
                ext2 = path.with_suffix('').suffix
                ext = (ext2 + ext1).lower()
                # Handle delete first and then continue so that
                # head_object and get_object (below) don't fail
                if event_name == OBJECT_DELETE:
                    batch_processor.append(
                        event_name,
                        bucket=bucket,
                        ext=ext,
                        etag=etag,
                        key=key,
                        last_modified=now_like_boto3(),
                        text="",
                        version_id=version_id
                    )
                    continue
                try:
                    head = retry_s3(
                        "head",
                        bucket,
                        key,
                        s3_client=s3_client,
                        version_id=version_id,
                        etag=etag
                    )
                except botocore.exceptions.ClientError as exception:
                    # "null" version sometimes results in 403s for buckets
                    # that have changed versioning, retry without it
                    if (exception.response.get('Error', {}).get('Code') == "403"
                            and version_id == "null"):
                        head = retry_s3(
                            "head",
                            bucket,
                            key,
                            s3_client=s3_client,
                            version_id=None,
                            etag=etag
                        )
                    else:
                        raise exception
                size = head["ContentLength"]
                last_modified = head["LastModified"]
                meta = head["Metadata"]
                try:
                    text = get_contents(
                        bucket,
                        key,
                        ext,
                        etag=etag,
                        version_id=version_id,
                        s3_client=s3_client,
                        size=size
                    )
                # we still want an entry for this document in elastic so that, e.g.,
                # the file counts from elastic are correct. re-raise below.
                except Exception as exc:  # pylint: disable=broad-except
                    text = ""
                    content_exception = exc
                    print("Content extraction failed", exc, bucket, key, etag, version_id)
                # decode Quilt-specific metadata
                if meta and "helium" in meta:
                    try:
                        decoded_helium = json.loads(meta["helium"])
                        meta["helium"] = decoded_helium or {}
                    except (KeyError, json.JSONDecodeError):
                        print("Unable to parse Quilt 'helium' metadata", meta)
                batch_processor.append(
                    event_name,
                    bucket=bucket,
                    key=key,
                    ext=ext,
                    meta=meta,
                    etag=etag,
                    version_id=version_id,
                    last_modified=last_modified,
                    size=size,
                    text=text
                )
            except botocore.exceptions.ClientError as boto_exc:
                if not should_retry_exception(boto_exc):
                    continue
                else:
                    print("Fatal exception for record", event_, boto_exc)
                    import traceback
                    traceback.print_tb(boto_exc.__traceback__)
                    raise boto_exc
        # flush the queue
        batch_processor.send_all()
    # note: if there are multiple content exceptions in the batch, this will
    # only raise the most recent one;
    # re-raise so that get_contents() failures end up in the DLQ
    if content_exception:
        raise content_exception
def handler(event, context):
    """enumerate S3 keys in event, extract relevant data,
    queue events, send to elastic via bulk() API
    """
    logger_ = get_quilt_logger()
    # message is a proper SQS message, which either contains a single event
    # (from the bucket notification system) or batch-many events as determined
    # by enterprise/**/bulk_loader.py
    # An exception that we'll want to re-raise after the batch sends
    content_exception = None
    batch_processor = DocumentQueue(context)
    s3_client = make_s3_client()
    for message in event["Records"]:
        body = json.loads(message["body"])
        body_message = json.loads(body["Message"])
        if "Records" not in body_message:
            # could be TEST_EVENT, or another unexpected event; skip it
            logger_.error("No 'Records' key in message['body']: %s", message)
            continue
        events = body_message["Records"]
        # event is a single S3 event
        for event_ in events:
            validated = shape_event(event_)
            if not validated:
                logger_.debug("Skipping invalid event %s", event_)
                continue
            event_ = validated
            logger_.debug("Processing %s", event_)
            try:
                event_name = event_["eventName"]
                # Process all Create:* and Remove:* events
                if not any(event_name.startswith(n) for n in EVENT_PREFIX.values()):
                    logger_.warning("Skipping unknown event type: %s", event_name)
                    continue
                bucket = event_["s3"]["bucket"]["name"]
                # In the grand tradition of IE6, S3 events turn spaces into '+'
                # TODO: check if eventbridge events do the same thing with +
                key = unquote_plus(event_["s3"]["object"]["key"])
                version_id = event_["s3"]["object"].get("versionId", None)
                # ObjectRemoved:Delete does not include "eTag"
                etag = event_["s3"]["object"].get("eTag", "")
                # synthetic events from bulk scanner might define lastModified
                last_modified = (
                    event_["s3"]["object"].get("lastModified") or event_["eventTime"]
                )
                # Get two levels of extensions to handle files like .csv.gz
                path = pathlib.PurePosixPath(key)
                ext1 = path.suffix
                ext2 = path.with_suffix('').suffix
                ext = (ext2 + ext1).lower()
                # Handle delete and deletemarker first and then continue so that
                # head_object and get_object (below) don't fail
                if event_name.startswith(EVENT_PREFIX["Removed"]):
                    do_index(
                        s3_client,
                        batch_processor,
                        event_name,
                        bucket=bucket,
                        etag=etag,
                        ext=ext,
                        key=key,
                        last_modified=last_modified,
                        version_id=version_id
                    )
                    continue
                try:
                    head = retry_s3(
                        "head",
                        bucket,
                        key,
                        s3_client=s3_client,
                        version_id=version_id,
                        etag=etag
                    )
                except botocore.exceptions.ClientError as first:
                    logger_.warning("head_object error: %s", first)
                    # "null" version sometimes results in 403s for buckets
                    # that have changed versioning, retry without it
                    if (first.response.get('Error', {}).get('Code') == "403"
                            and version_id == "null"):
                        try:
                            head = retry_s3(
                                "head",
                                bucket,
                                key,
                                s3_client=s3_client,
                                version_id=None,
                                etag=etag
                            )
                        except botocore.exceptions.ClientError as second:
                            # this will bypass the DLQ but that's the right thing to do
                            # as some listed objects may NEVER succeed head requests
                            # (e.g. foreign owner) and there's no reason to torpedo
                            # the whole batch (which might include good files)
                            logger_.warning("Retried head_object error: %s", second)
                            logger_.error("Fatal head_object, skipping event: %s", event_)
                            continue
                # backfill fields based on the head_object
                size = head["ContentLength"]
                last_modified = last_modified or head["LastModified"].isoformat()
                etag = head.get("etag") or etag
                version_id = head.get("VersionId") or version_id
                try:
                    text = maybe_get_contents(
                        bucket,
                        key,
                        ext,
                        etag=etag,
                        version_id=version_id,
                        s3_client=s3_client,
                        size=size
                    )
                # we still want an entry for this document in elastic so that, e.g.,
                # the file counts from elastic are correct
                # these exceptions can happen for a variety of reasons (e.g. glacier
                # storage class, index event arrives after delete has occurred, etc.)
                # given how common they are, we shouldn't fail the batch for this
                except Exception as exc:  # pylint: disable=broad-except
                    text = ""
                    logger_.warning("Content extraction failed %s %s %s", bucket, key, exc)
                do_index(
                    s3_client,
                    batch_processor,
                    event_name,
                    bucket=bucket,
                    etag=etag,
                    ext=ext,
                    key=key,
                    last_modified=last_modified,
                    size=size,
                    text=text,
                    version_id=version_id
                )
            except botocore.exceptions.ClientError as boto_exc:
                if not should_retry_exception(boto_exc):
                    logger_.warning("Skipping non-fatal exception: %s", boto_exc)
                    continue
                logger_.critical("Failed record: %s, %s", event, boto_exc)
                raise boto_exc
    # flush the queue
    batch_processor.send_all()
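
# shape_event() is called by the handler above but defined elsewhere; this is
# a minimal sketch of the contract the handler relies on, assuming it returns
# a validated event dict or a falsy value. The checks below are illustrative
# only (the real implementation may validate against a JSON schema).
def _shape_event_sketch(event_):
    try:
        # the fields the handler reads unconditionally must be present
        event_["eventName"]
        event_["eventTime"]
        event_["s3"]["bucket"]["name"]
        event_["s3"]["object"]["key"]
    except (KeyError, TypeError):
        # falsy return means the handler logs and skips the event
        return None
    return event_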
def index_if_package(
    s3_client,
    doc_queue: DocumentQueue,
    *,
    bucket: str,
    etag: str,
    key: str,
    last_modified: str,
    version_id: Optional[str],
) -> bool:
    """index manifest pointer files as package documents in ES

    Returns:
        - True if pointer to manifest (and passes to doc_queue for indexing)
        - False if not a manifest (no attempt at indexing)
    """
    logger_ = get_quilt_logger()
    pointer_prefix, pointer_file = split(key)
    handle = pointer_prefix[len(POINTER_PREFIX_V1):]
    if (
        not pointer_file
        or not pointer_prefix.startswith(POINTER_PREFIX_V1)
        or len(handle) < 3
        or '/' not in handle
    ):
        logger_.debug("Not indexing as manifest file s3://%s/%s", bucket, key)
        return False
    try:
        manifest_timestamp = int(pointer_file)
        if not 1451631600 <= manifest_timestamp <= 1767250800:
            logger_.warning("Unexpected manifest timestamp s3://%s/%s", bucket, key)
            return False
    except ValueError as err:
        logger_.debug("Non-integer manifest pointer: s3://%s/%s, %s", bucket, key, err)

    def get_pkg_data():
        try:
            package_hash = s3_client.get_object(
                Bucket=bucket,
                Key=key,
            )['Body'].read().decode()
        except botocore.exceptions.ClientError:
            return
        manifest_key = f'{MANIFEST_PREFIX_V1}{package_hash}'
        first = select_manifest_meta(s3_client, bucket, manifest_key)
        if not first:
            return
        stats = select_package_stats(s3_client, bucket, manifest_key)
        if not stats:
            return
        return {
            "key": key,
            "etag": etag,
            "version_id": version_id,
            "last_modified": last_modified,
            "delete_marker": False,  # TODO: remove
            "handle": handle,
            "pointer_file": pointer_file,
            "hash": package_hash,
            "package_stats": stats,
            "metadata": json.dumps(first.get("user_meta", {})),
            "comment": str(first.get("message", "")),
        }

    data = get_pkg_data() or {}
    doc_queue.append_document({
        "_index": bucket + PACKAGE_INDEX_SUFFIX,
        "_id": key,
        "_op_type": "index" if data else "delete",
        **data,
    })
    return True
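
# The two bulk-API document shapes the append_document() call above can emit,
# shown for a hypothetical bucket "example-bucket" and an assumed
# PACKAGE_INDEX_SUFFIX of "_packages" (the real suffix is defined elsewhere):
#
#   resolvable pointer:   {"_index": "example-bucket_packages", "_id": key,
#                          "_op_type": "index", "handle": ..., "hash": ...,
#                          "package_stats": ..., "metadata": ..., ...}
#   unresolvable pointer: {"_index": "example-bucket_packages", "_id": key,
#                          "_op_type": "delete"}
#
# i.e. when get_pkg_data() returns nothing, the pointer's package document is
# deleted from the index rather than left stale.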
def index_if_package(
        s3_client,
        doc_queue: DocumentQueue,
        event_type: str,
        *,
        bucket: str,
        etag: str,
        ext: str,
        key: str,
        last_modified: str,
        version_id: Optional[str],
        size: int
) -> bool:
    """index manifest pointer files as package documents in ES

    Returns:
        - True if pointer to manifest (and passes to doc_queue for indexing)
        - False if not a manifest (no attempt at indexing)
    """
    logger_ = get_quilt_logger()
    pointer_prefix, pointer_file = split(key)
    handle = pointer_prefix[len(POINTER_PREFIX_V1):]
    if (
        not pointer_file
        or not pointer_prefix.startswith(POINTER_PREFIX_V1)
        or len(handle) < 3
        or '/' not in handle
    ):
        logger_.debug("Not indexing as manifest file s3://%s/%s", bucket, key)
        return False
    try:
        manifest_timestamp = int(pointer_file)
        is_tag = False
        if not 1451631600 <= manifest_timestamp <= 1767250800:
            logger_.warning("Unexpected manifest timestamp s3://%s/%s", bucket, key)
            return False
    except ValueError as err:
        is_tag = True
        logger_.debug("Non-integer manifest pointer: s3://%s/%s, %s", bucket, key, err)
    package_hash = ''
    first_dict = {}
    stats = None
    # we only need to get manifest contents for proper create events (not latest pointers)
    if event_type.startswith(EVENT_PREFIX["Created"]) and not is_tag:
        package_hash = get_plain_text(
            bucket,
            key,
            size,
            None,
            etag=etag,
            s3_client=s3_client,
            version_id=version_id,
        ).strip()
        manifest_key = f'{MANIFEST_PREFIX_V1}{package_hash}'
        first = select_manifest_meta(s3_client, bucket, manifest_key)
        stats = select_package_stats(s3_client, bucket, manifest_key)
        if not first:
            logger_.error("S3 select failed %s %s", bucket, manifest_key)
            return False
        try:
            first_dict = json.loads(first)
        except (json.JSONDecodeError, botocore.exceptions.ClientError) as exc:
            print(
                f"{exc}\n"
                f"\tFailed to select first line of manifest s3://{bucket}/{key}."
                f"\tGot {first}."
            )
            return False
    doc_queue.append(
        event_type,
        DocTypes.PACKAGE,
        bucket=bucket,
        etag=etag,
        ext=ext,
        handle=handle,
        key=key,
        last_modified=last_modified,
        # if we don't have the hash, we're processing a tag
        package_hash=(package_hash or pointer_file),
        package_stats=stats,
        pointer_file=pointer_file,
        comment=str(first_dict.get("message", "")),
        metadata=json.dumps(first_dict.get("user_meta", {})),
        version_id=version_id,
    )
    return True
def handler(event, context):
    """enumerate S3 keys in event, extract relevant data and metadata,
    queue events, send to elastic via bulk() API
    """
    # message is a proper SQS message, which either contains a single event
    # (from the bucket notification system) or batch-many events as determined
    # by enterprise/**/bulk_loader.py
    for message in event["Records"]:
        body = json.loads(message["body"])
        body_message = json.loads(body["Message"])
        if "Records" not in body_message:
            if body_message.get("Event") == TEST_EVENT:
                # Consume and ignore this event, which is an initial message from
                # SQS; see https://forums.aws.amazon.com/thread.jspa?threadID=84331
                continue
            else:
                print("Unexpected message['body']. No 'Records' key.", message)
                raise Exception("Unexpected message['body']. No 'Records' key.")
        batch_processor = DocumentQueue(context)
        events = body_message.get("Records", [])
        s3_client = make_s3_client()
        # event is a single S3 event
        for event_ in events:
            try:
                event_name = event_["eventName"]
                bucket = unquote(event_["s3"]["bucket"]["name"])
                # In the grand tradition of IE6, S3 events turn spaces into '+'
                key = unquote_plus(event_["s3"]["object"]["key"])
                version_id = event_["s3"]["object"].get("versionId")
                version_id = unquote(version_id) if version_id else None
                etag = unquote(event_["s3"]["object"]["eTag"])
                ext = pathlib.PurePosixPath(key).suffix.lower()
                try:
                    head = retry_s3(
                        "head",
                        bucket,
                        key,
                        s3_client=s3_client,
                        version_id=version_id,
                        etag=etag
                    )
                except botocore.exceptions.ClientError as exception:
                    # "null" version sometimes results in 403s for buckets
                    # that have changed versioning, retry without it
                    if (exception.response.get('Error', {}).get('Code') == "403"
                            and version_id == "null"):
                        head = retry_s3(
                            "head",
                            bucket,
                            key,
                            s3_client=s3_client,
                            version_id=None,
                            etag=etag
                        )
                    else:
                        raise exception
                size = head["ContentLength"]
                last_modified = head["LastModified"]
                meta = head["Metadata"]
                text = ""
                if event_name == OBJECT_DELETE:
                    batch_processor.append(
                        event_name,
                        bucket=bucket,
                        ext=ext,
                        etag=etag,
                        key=key,
                        last_modified=last_modified,
                        text=text,
                        version_id=version_id
                    )
                    continue
                text = get_contents(
                    bucket,
                    key,
                    ext,
                    etag=etag,
                    version_id=version_id,
                    s3_client=s3_client,
                    size=size
                )
                # decode Quilt-specific metadata
                if meta and "helium" in meta:
                    try:
                        decoded_helium = json.loads(meta["helium"])
                        meta["helium"] = decoded_helium or {}
                    except (KeyError, json.JSONDecodeError):
                        print("Unable to parse Quilt 'helium' metadata", meta)
                batch_processor.append(
                    event_name,
                    bucket=bucket,
                    key=key,
                    ext=ext,
                    meta=meta,
                    etag=etag,
                    version_id=version_id,
                    last_modified=last_modified,
                    size=size,
                    text=text
                )
            except botocore.exceptions.ClientError as boto_exc:
                if not should_retry_exception(boto_exc):
                    continue
                else:
                    print("Fatal exception for record", event_, boto_exc)
                    import traceback
                    traceback.print_tb(boto_exc.__traceback__)
                    raise boto_exc
            except Exception as exc:  # pylint: disable=broad-except
                print("Fatal exception for record", event_, exc)
                import traceback
                traceback.print_tb(exc.__traceback__)
                raise exc
        # flush the queue
        batch_processor.send_all()
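
# retry_s3() is used by every handler revision above but defined elsewhere; a
# hedged sketch of the observed contract: dispatch "head"/"get" to the
# matching S3 API call, passing VersionId only when one is known. The retry
# policy itself (the real helper presumably wraps this with backoff) and the
# exact use of etag are assumptions, not the real implementation.
def _retry_s3_sketch(operation, bucket, key, *, s3_client, etag, version_id=None):
    assert operation in ("head", "get")
    method = s3_client.head_object if operation == "head" else s3_client.get_object
    params = {"Bucket": bucket, "Key": key}
    if version_id:
        params["VersionId"] = version_id
    elif etag:
        # assumption: with no usable version, the etag could drive a
        # conditional request so we never read a different object generation
        params["IfMatch"] = etag
    return method(**params)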