Example #1
def select_package_stats(s3_client, bucket, manifest_key) -> Optional[dict]:
    """use s3 select to generate file stats for package"""
    logger_ = get_quilt_logger()
    try:
        raw_stats = query_manifest_content(
            s3_client,
            bucket=bucket,
            key=manifest_key,
            sql_stmt=SELECT_PACKAGE_STATS
        ).read()

        if raw_stats:
            stats = json.loads(raw_stats)
            assert isinstance(stats['total_bytes'], int)
            assert isinstance(stats['total_files'], int)

            return stats

    except (
            AssertionError,
            botocore.exceptions.ClientError,
            json.JSONDecodeError,
            KeyError,
    ) as err:
        logger_.exception("Unable to compute package stats via S3 select")

    return None
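
For context, here is a minimal sketch of what the SELECT_PACKAGE_STATS statement could look like. This is an assumption for illustration only, not the project's actual constant; it assumes the manifest is JSON Lines with one entry object per package file, each carrying "logical_key" and "size" fields.

# Assumed shape of the S3 Select statement (illustrative only)
SELECT_PACKAGE_STATS = """
    SELECT COALESCE(SUM(s."size"), 0) AS total_bytes,
           COUNT(*) AS total_files
    FROM s3object s
    WHERE s."logical_key" IS NOT MISSING
"""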
Example #2
    def _append_document(self, doc):
        """append well-formed documents (used for retry or by append())"""
        logger_ = get_quilt_logger()
        if doc.get("content"):
            # document text dominates memory footprint; OK to neglect the
            # small fixed size for the JSON metadata
            self.size += min(doc["size"], ELASTIC_LIMIT_BYTES)
        logger_.debug("Appending document %s", doc)
        self.queue.append(doc)
Example #3
    def append(
        self,
        *,
        bucket: str,
        key: str,
        etag: str,
        last_modified: str,
        size: int,
        text: str,
        event_type: str,
        ext: str,
        version_id,
    ):
        """format event as a document and then queue the document"""
        logger_ = get_quilt_logger()
        if not bucket or not key:
            raise ValueError(f"bucket={bucket} or key={key} required but missing")
        is_delete_marker = False
        if event_type.startswith(EVENT_PREFIX["Created"]):
            _op_type = "index"
        elif event_type.startswith(EVENT_PREFIX["Removed"]):
            _op_type = "delete"
            if event_type.endswith("DeleteMarkerCreated"):
                is_delete_marker = True
                # we index (not delete) delete markers to sync state with S3
                _op_type = "index"
        else:
            logger_.error("Skipping unrecognized event type %s", event_type)
            return
        # On types and fields, see
        # https://www.elastic.co/guide/en/elasticsearch/reference/master/mapping.html
        # Set common properties on the document
        # BE CAREFUL changing these values, as type changes or missing fields
        # can cause exceptions from ES
        # ensure the same versionId and primary keys (_id) as given by
        #  list-object-versions in the enterprise bulk_scanner
        version_id = version_id or "null"
        # core properties for all document types;
        # see https://elasticsearch-py.readthedocs.io/en/6.3.1/helpers.html
        body = {
            "_index": bucket,
            "_op_type": _op_type,  # determines if action is upsert (index) or delete
            "_id": get_id(key, version_id),
            "etag": etag,
            "key": key,
            "last_modified": last_modified,
            "size": size,
            "delete_marker": is_delete_marker,
            "version_id": version_id,
            "content": text,  # field for full-text search
            "event": event_type,
            "ext": ext,
            "updated": datetime.utcnow().isoformat(),
        }

        self.append_document(body)
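
The _id above comes from get_id(). A plausible one-line implementation, assumed here for illustration (the project's real helper may differ), simply joins the key and version:

def get_id(key: str, version_id: str) -> str:
    """Sketch: build a stable ES document id from object key and version id."""
    return f"{key}:{version_id}"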
Example #4
def get_time_remaining(context):
    """returns time remaining in seconds before lambda context is shut down"""
    logger_ = get_quilt_logger()
    time_remaining = floor(context.get_remaining_time_in_millis()/1000)
    if time_remaining < 30:
        logger_.warning(
            "Lambda function has %s sec remaining. Reduce batch size?", time_remaining
        )

    return time_remaining
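
To exercise get_time_remaining() outside AWS, a stub context object is enough; the class below is purely illustrative and not part of the codebase.

class FakeLambdaContext:
    """Minimal stand-in for the AWS Lambda context object (local testing only)."""
    def __init__(self, millis_left: int):
        self._millis_left = millis_left

    def get_remaining_time_in_millis(self) -> int:
        return self._millis_left

# 25 seconds remaining is below the 30-second threshold, so the warning fires
assert get_time_remaining(FakeLambdaContext(25_000)) == 25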
Example #5
    def _filter_and_delete_packages(self, elastic):
        """handle package hard delete"""
        logger_ = get_quilt_logger()
        true_docs = []
        for doc in self.queue:
            pointer_file = doc.get("pointer_file")
            # handle hard package delete outside of the bulk operation
            if doc["_op_type"] == "delete" and pointer_file and not doc.get("delete_marker"):
                index = doc.get("_index")
                assert index.endswith(PACKAGE_INDEX_SUFFIX), f"Refuse to delete non-package: {doc}"
                handle = doc.get("handle")
                assert handle, "Cannot delete package without handle"
                # no try/except because failure to delete, or trying to delete things
                # that aren't in ES, does not throw
                deletes = elastic.delete_by_query(
                    index=index,
                    body={
                        "query": {
                            "bool": {
                                "must": [
                                    # use match (not term) because some of these fields are analyzed
                                    {"match": {"handle": handle}},
                                    {"match": {"pointer_file": pointer_file}},
                                    {"match": {"delete_marker": False}},
                                ]
                            }
                        }
                    },
                    # we delete synchronously, so don't let it linger too long
                    timeout='20s',
                )
                logger_.debug("Deleted %s stamped %s: %s", handle, pointer_file, deletes)
                if not deletes.get("deleted"):
                    logger_.warning("Unable to delete: %s", doc)
            # send everything else to bulk()
            else:
                logger_.debug("Not filtering docs: %s", doc)
                true_docs.append(doc)
        # the queue is now everything we didn't delete by query above
        self.queue = true_docs
Example #6
def retry_s3(
        operation,
        bucket,
        key,
        size=None,
        limit=None,
        *,
        etag,
        version_id,
        s3_client
):
    """retry head or get operation to S3 with; stop before we run out of time.
    retry is necessary since, due to eventual consistency, we may not
    always get the required version of the object.
    """
    logger_ = get_quilt_logger()

    if operation == "head":
        function_ = s3_client.head_object
    elif operation == "get":
        function_ = s3_client.get_object
    else:
        raise ValueError(f"unexpected operation: {operation}")
    # Keyword arguments to function_
    arguments = {
        "Bucket": bucket,
        "Key": key
    }
    if operation == 'get' and size and limit:
        # can only request range if file is not empty
        arguments['Range'] = f"bytes=0-{min(size, limit) - 1}"
    if version_id:
        arguments['VersionId'] = version_id
    elif etag:
        arguments['IfMatch'] = etag

    logger_.debug("Entering @retry: %s, %s", operation, arguments)

    @retry(
        # debug
        reraise=True,
        stop=stop_after_attempt(MAX_RETRY),
        wait=wait_exponential(multiplier=2, min=4, max=10),
        retry=(retry_if_exception(should_retry_exception))
    )
    def call():
        """local function so we can set stop_after_delay dynamically"""
        # TODO: remove all this, stop_after_delay is not dynamically loaded anymore
        return function_(**arguments)

    return call()
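
retry_s3() relies on should_retry_exception() to decide which botocore errors are transient. A hedged sketch of such a predicate follows; the error codes listed are assumptions, and the project's real rules may differ.

import botocore.exceptions

def should_retry_exception(exception) -> bool:
    """Sketch only: retry throttling and eventual-consistency misses, give up otherwise."""
    if isinstance(exception, botocore.exceptions.ClientError):
        code = exception.response.get("Error", {}).get("Code", "")
        # SlowDown/throttling and 404s (object version not visible yet) are worth retrying
        return code in ("SlowDown", "ThrottlingException", "NoSuchKey", "404")
    return False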
Example #7
    def append_document(self, doc):
        """append well-formed documents (used for retry or by append())"""
        logger_ = get_quilt_logger()
        # This should be removed when we migrate to recent ES versions, see
        # https://www.elastic.co/guide/en/elasticsearch/reference/6.7/removal-of-types.html
        doc["_type"] = "_doc"
        # document text dominates memory footprint; OK to neglect the
        # small fixed size for the JSON metadata
        self.size += len(doc.get("content") or "")
        logger_.debug("Appending document %s", doc)
        self.queue.append(doc)

        if self.size >= QUEUE_LIMIT_BYTES:
            self.send_all()
Example #8
def do_index(
        s3_client,
        doc_queue: DocumentQueue,
        event_type: str,
        *,
        bucket: str,
        etag: str,
        ext: str,
        key: str,
        last_modified: str,
        text: str = '',
        size: int = 0,
        version_id: Optional[str] = None,
):
    """wrap dual indexing of packages and objects"""
    logger_ = get_quilt_logger()
    # index as object (always)
    logger_.debug("%s to indexing queue (%s)", key, event_type)
    doc_queue.append(
        event_type,
        DocTypes.OBJECT,
        bucket=bucket,
        ext=ext,
        etag=etag,
        key=key,
        last_modified=last_modified,
        size=size,
        text=text,
        version_id=version_id
    )
    # maybe index as package
    if index_if_package(
        s3_client,
        doc_queue,
        event_type,
        bucket=bucket,
        etag=etag,
        ext=ext,
        key=key,
        last_modified=last_modified,
        size=size,
        version_id=version_id,
    ):
        logger_.debug("%s indexed as package (%s)", key, event_type)
Example #9
def bulk_send(elastic, list_):
    """make a bulk() call to elastic"""
    logger_ = get_quilt_logger()
    logger_.debug("bulk_send(): %s", list_)
    return bulk(
        elastic,
        list_,
        # Some magic numbers to reduce memory pressure
        # e.g. see https://github.com/wagtail/wagtail/issues/4554
        chunk_size=100,  # max number of documents sent in one chunk
        # The stated default is max_chunk_bytes=10485760, but with that default
        # ES still returns an exception stating that the very same request
        # size limit has been exceeded
        max_chunk_bytes=CHUNK_LIMIT_BYTES,
        # number of retries for 429 (too many requests only)
        # all other errors handled by our code
        max_retries=RETRY_429,
        # we'll process errors on our own
        raise_on_error=False,
        raise_on_exception=False)
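
Because raise_on_error and raise_on_exception are both off, helpers.bulk() returns a (success_count, errors) tuple instead of raising. A caller might inspect it like this; elastic, documents, and logger are placeholder names for illustration.

successes, errors = bulk_send(elastic, documents)
if errors:
    for err in errors:
        logger.warning("Document failed to index: %s", err)
logger.debug("Indexed %d documents", successes)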
Example #10
def shape_event(event: dict):
    """check event schema, return None if schema check fails"""
    logger_ = get_quilt_logger()

    try:
        validate(
            instance=event,
            schema=EVENT_SCHEMA,
            # format_checker= required for format:date-time validation
            # (we also need strict-rfc3339 in requirements.txt)
            format_checker=draft7_format_checker,
        )
    except ValidationError as error:
        logger_.error("Invalid event format: %s\n%s", error, event)
        return None
    # be a good citizen and don't modify params
    return {
        **event,
        'eventName': map_event_name(event),
    }
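
For reference, a pared-down EVENT_SCHEMA that would satisfy the validate() call above. The real schema is larger; treat this draft-07 fragment as an assumption covering only the fields the handler reads.

EVENT_SCHEMA = {
    "type": "object",
    "properties": {
        "eventName": {"type": "string"},
        "eventTime": {"type": "string", "format": "date-time"},
        "s3": {
            "type": "object",
            "properties": {
                "bucket": {"type": "object", "properties": {"name": {"type": "string"}}},
                "object": {"type": "object", "properties": {"key": {"type": "string"}}},
            },
            "required": ["bucket", "object"],
        },
    },
    "required": ["eventName", "eventTime", "s3"],
}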
Example #11
def handler(event, context):
    """enumerate S3 keys in event, extract relevant data, queue events, send to
    elastic via bulk() API
    """
    logger_ = get_quilt_logger()
    # message is a proper SQS message, which either contains a single event
    # (from the bucket notification system) or batch-many events as determined
    # by enterprise/**/bulk_loader.py
    # An exception that we'll want to re-raise after the batch sends
    content_exception = None
    for message in event["Records"]:
        body = json.loads(message["body"])
        body_message = json.loads(body["Message"])
        if "Records" not in body_message:
            if body_message.get("Event") == TEST_EVENT:
                logger_.debug("Skipping S3 Test Event")
                # Consume and ignore this event, which is an initial message from
                # SQS; see https://forums.aws.amazon.com/thread.jspa?threadID=84331
                continue
            print("Unexpected message['body']. No 'Records' key.", message)
            raise Exception("Unexpected message['body']. No 'Records' key.")
        batch_processor = DocumentQueue(context)
        events = body_message.get("Records", [])
        s3_client = make_s3_client()
        # event is a single S3 event
        for event_ in events:
            logger_.debug("Processing %s", event_)
            try:
                event_name = event_["eventName"]
                # Process all Create:* and Remove:* events
                if not any(
                        event_name.startswith(n)
                        for n in EVENT_PREFIX.values()):
                    continue
                bucket = unquote(event_["s3"]["bucket"]["name"])
                # In the grand tradition of IE6, S3 events turn spaces into '+'
                key = unquote_plus(event_["s3"]["object"]["key"])
                version_id = event_["s3"]["object"].get("versionId")
                version_id = unquote(version_id) if version_id else None
                # Skip delete markers when versioning is on
                if version_id and event_name == "ObjectRemoved:DeleteMarkerCreated":
                    continue
                # ObjectRemoved:Delete does not include "eTag"
                etag = unquote(event_["s3"]["object"].get("eTag", ""))
                # Get two levels of extensions to handle files like .csv.gz
                path = pathlib.PurePosixPath(key)
                ext1 = path.suffix
                ext2 = path.with_suffix('').suffix
                ext = (ext2 + ext1).lower()

                # Handle delete first and then continue so that
                # head_object and get_object (below) don't fail
                if event_name.startswith(EVENT_PREFIX["Removed"]):
                    logger_.debug("Object delete to queue")
                    batch_processor.append(event_name,
                                           DocTypes.OBJECT,
                                           bucket=bucket,
                                           ext=ext,
                                           etag=etag,
                                           key=key,
                                           last_modified=now_like_boto3(),
                                           text="",
                                           version_id=version_id)
                    continue

                try:
                    logger_.debug("Get object head")
                    head = retry_s3("head",
                                    bucket,
                                    key,
                                    s3_client=s3_client,
                                    version_id=version_id,
                                    etag=etag)
                except botocore.exceptions.ClientError as first:
                    logger_.warning("head_object error: %s", first)
                    # "null" version sometimes results in 403s for buckets
                    # that have changed versioning, retry without it
                    if (first.response.get('Error', {}).get('Code') == "403"
                            and version_id == "null"):
                        try:
                            head = retry_s3("head",
                                            bucket,
                                            key,
                                            s3_client=s3_client,
                                            version_id=None,
                                            etag=etag)
                        except botocore.exceptions.ClientError as second:
                            # this will bypass the DLQ but that's the right thing to do
                            # as some listed objects may NEVER succeed head requests
                            # (e.g. foreign owner) and there's no reason to torpedo
                            # the whole batch (which might include good files)
                            logger_.warning("Retried head_object error: %s",
                                            second)

                    logger_.error("Fatal head_object, skipping event: %s",
                                  event_)
                    continue

                size = head["ContentLength"]
                last_modified = head["LastModified"]

                did_index = index_if_manifest(s3_client,
                                              batch_processor,
                                              event_name,
                                              bucket=bucket,
                                              etag=etag,
                                              ext=ext,
                                              key=key,
                                              last_modified=last_modified,
                                              size=size,
                                              version_id=version_id)
                logger_.debug("Logged as manifest? %s", did_index)

                try:
                    text = maybe_get_contents(bucket,
                                              key,
                                              ext,
                                              etag=etag,
                                              version_id=version_id,
                                              s3_client=s3_client,
                                              size=size)
                # we still want an entry for this document in elastic so that, e.g.,
                # the file counts from elastic are correct. re-raise below.
                except Exception as exc:  # pylint: disable=broad-except
                    text = ""
                    content_exception = exc
                    logger_.error("Content extraction failed %s %s %s", bucket,
                                  key, exc)

                batch_processor.append(event_name,
                                       DocTypes.OBJECT,
                                       bucket=bucket,
                                       key=key,
                                       ext=ext,
                                       etag=etag,
                                       version_id=version_id,
                                       last_modified=last_modified,
                                       size=size,
                                       text=text)

            except botocore.exceptions.ClientError as boto_exc:
                if not should_retry_exception(boto_exc):
                    logger_.warning("Got exception but retrying: %s", boto_exc)
                    continue
                logger_.critical("Failed record: %s, %s", event, boto_exc)
                raise boto_exc
        # flush the queue
        batch_processor.send_all()
        # note: if there are multiple content exceptions in the batch, this will
        # only raise the most recent one;
        # re-raise so that get_contents() failures end up in the DLQ
        if content_exception:
            logger_.critical("Failed batch due to %s", content_exception)
            raise content_exception
Example #12
def index_if_manifest(s3_client, doc_queue: DocumentQueue, event_type: str, *,
                      bucket: str, etag: str, ext: str, key: str,
                      last_modified: str, version_id: Optional[str],
                      size: int) -> bool:
    """index manifest files as package documents in ES
        Returns:
            - True if manifest (and passes to doc_queue for indexing)
            - False if not a manifest (no attempt at indexing)
    """
    logger_ = get_quilt_logger()
    pointer_prefix, pointer_file = split(key)
    handle = pointer_prefix[len(POINTER_PREFIX_V1):]
    if (not pointer_prefix.startswith(POINTER_PREFIX_V1) or len(handle) < 3
            or '/' not in handle):
        logger_.debug("Not indexing as manifest file s3://%s/%s", bucket, key)
        return False
    try:
        manifest_timestamp = int(pointer_file)
    except ValueError as err:
        logger_.debug("Non-integer manifest pointer: s3://%s/%s, %s", bucket,
                      key, err)
        # this is probably the latest pointer, skip it. manifest already indexed.
        return False
    else:
        if not 1451631600 <= manifest_timestamp <= 1767250800:
            logger_.warning("Unexpected manifest timestamp s3://%s/%s", bucket,
                            key)
            return False

    package_hash = get_plain_text(
        bucket,
        key,
        size,
        None,
        etag=etag,
        s3_client=s3_client,
        version_id=version_id,
    ).strip()

    manifest_key = f"{MANIFEST_PREFIX_V1}{package_hash}"
    first = select_manifest_meta(s3_client, bucket, manifest_key)
    stats = select_package_stats(s3_client, bucket, manifest_key)
    if not first:
        logger_.error("S3 select failed %s %s", bucket, manifest_key)
        return False
    try:
        first_dict = json.loads(first)
        doc_queue.append(
            event_type,
            DocTypes.PACKAGE,
            bucket=bucket,
            etag=etag,
            ext=ext,
            handle=handle,
            key=manifest_key,
            last_modified=last_modified,
            package_hash=package_hash,
            package_stats=stats,
            pointer_file=pointer_file,
            comment=str(first_dict.get("message", "")),
            metadata=json.dumps(first_dict.get("user_meta", {})),
        )
        return True
    except (json.JSONDecodeError, botocore.exceptions.ClientError) as exc:
        print(f"{exc}\n"
              f"\tFailed to select first line of manifest s3://{bucket}/{key}."
              f"\tGot {first}.")
        return False
Example #13
        'meta': {
            'type': 'object',
        },
    },
    'required': ['logical_key', 'physical_key'],
}

s3 = boto3.client('s3')
lambda_ = boto3.client('lambda')

# Monkey patch quilt3 S3ClientProvider, so it builds a client using user credentials.
user_boto_session = None
quilt3.data_transfer.S3ClientProvider.get_boto_session = staticmethod(
    lambda: user_boto_session)

logger = get_quilt_logger()


def calculate_pkg_hashes(boto_session, pkg):
    entries = []
    for lk, entry in pkg.walk():
        if entry.hash is not None:
            continue
        if entry.size > S3_HASH_LAMBDA_MAX_FILE_SIZE_BYTES:
            raise FileTooLargeForHashing(lk)

        entries.append(entry)

    user_s3 = boto_session.client("s3")

    @functools.lru_cache(maxsize=None)
Example #14
    def append(
            self,
            event_type: str,
            doc_type: DocTypes,
            # properties unique to a document type are non-required kwargs
            ext: str = '',
            handle: str = '',
            metadata: str = '',
            pointer_file: str = '',
            # this could be the hash OR tag; to be used in _id primary key
            package_hash: str = '',
            package_stats: Dict[str, int] = None,
            tags: List[str] = (),
            text: str = '',
            version_id=None,
            *,
            # common properties are required kwargs
            bucket: str,
            comment: str = '',
            key: str,
            etag: str,
            last_modified: str,
            size: int = 0
    ):
        """format event as a document and then queue the document"""
        logger_ = get_quilt_logger()
        if not bucket or not key:
            raise ValueError(f"bucket={bucket} or key={key} required but missing")
        is_delete_marker = False
        if event_type.startswith(EVENT_PREFIX["Created"]):
            _op_type = "index"
        elif event_type.startswith(EVENT_PREFIX["Removed"]):
            _op_type = "delete"
            if event_type.endswith("DeleteMarkerCreated"):
                is_delete_marker = True
                # we index (not delete) delete markers to sync state with S3
                _op_type = "index"
        else:
            logger_.error("Skipping unrecognized event type %s", event_type)
            return
        # On types and fields, see
        # https://www.elastic.co/guide/en/elasticsearch/reference/master/mapping.html
        # Set common properties on the document
        # BE CAREFUL changing these values, as type changes or missing fields
        # can cause exceptions from ES
        index_name = bucket
        if doc_type == DocTypes.PACKAGE:
            index_name += PACKAGE_INDEX_SUFFIX
        if not index_name:
            raise ValueError(f"Can't infer index name; bucket={bucket}, doc_type={doc_type}")
        # ensure the same versionId and primary keys (_id) as given by
        #  list-object-versions in the enterprise bulk_scanner
        version_id = version_id or "null"
        # core properties for all document types;
        # see https://elasticsearch-py.readthedocs.io/en/6.3.1/helpers.html
        body = {
            "_index": index_name,
            "_op_type": _op_type,  # determines if action is upsert (index) or delete
            # TODO remove this; it's not meaningful since we use a different index
            # type for object vs. package documents
            "_type": "_doc",
            "_id": get_id(key, version_id),
            # TODO nest fields under "document" and maybe use _type:{package, object}
            "comment": comment,
            "etag": etag,
            "key": key,
            "last_modified": last_modified,
            "size": size,
            "delete_marker": is_delete_marker,
            "version_id": version_id,
        }
        if doc_type == DocTypes.PACKAGE:
            if not handle:
                raise ValueError("missing required argument for package doc")
            if _op_type == "index":
                if not (pointer_file and package_hash):
                    raise ValueError("missing required argument for package doc")
            if not (
                package_stats is None
                or isinstance(package_stats, dict)
                and {'total_files', 'total_bytes'}.issubset(package_stats)
            ):
                raise ValueError("Malformed package_stats")
            body.update({
                "handle": handle,
                "hash": package_hash,
                "metadata": metadata,
                "pointer_file": pointer_file,
                "tags": ",".join(tags)
            })
            if package_stats:
                body.update({
                    "package_stats": package_stats,
                })
        elif doc_type == DocTypes.OBJECT:
            body.update({
                # TODO: remove this field from ES in /enterprise (now deprecated and unused)
                # here we explicitly drop the comment
                "comment": "",
                "content": text,  # field for full-text search
                "event": event_type,
                "ext": ext,
                "target": "",
                "updated": datetime.utcnow().isoformat(),
            })
        else:
            logger_.error("Skipping unexpected document type: %s", doc_type)

        self._append_document(body)

        if self.size >= QUEUE_LIMIT_BYTES:
            self.send_all()
Example #15
def index_if_package(
        s3_client,
        doc_queue: DocumentQueue,
        event_type: str,
        *,
        bucket: str,
        etag: str,
        ext: str,
        key: str,
        last_modified: str,
        version_id: Optional[str],
        size: int
) -> bool:
    """index manifest pointer files as package documents in ES
        Returns:
            - True if pointer to manifest (and passes to doc_queue for indexing)
            - False if not a manifest (no attempt at indexing)
    """
    logger_ = get_quilt_logger()
    pointer_prefix, pointer_file = split(key)
    handle = pointer_prefix[len(POINTER_PREFIX_V1):]
    if (
            not pointer_file
            or not pointer_prefix.startswith(POINTER_PREFIX_V1)
            or len(handle) < 3
            or '/' not in handle
    ):
        logger_.debug("Not indexing as manifest file s3://%s/%s", bucket, key)
        return False
    try:
        manifest_timestamp = int(pointer_file)
        is_tag = False
        if not 1451631600 <= manifest_timestamp <= 1767250800:
            logger_.warning("Unexpected manifest timestamp s3://%s/%s", bucket, key)
            return False
    except ValueError as err:
        is_tag = True
        logger_.debug("Non-integer manifest pointer: s3://%s/%s, %s", bucket, key, err)

    package_hash = ''
    first_dict = {}
    stats = None
    # we only need to get manifest contents for proper create events (not latest pointers)
    if event_type.startswith(EVENT_PREFIX["Created"]) and not is_tag:
        package_hash = get_plain_text(
            bucket,
            key,
            size,
            None,
            etag=etag,
            s3_client=s3_client,
            version_id=version_id,
        ).strip()
        manifest_key = f'{MANIFEST_PREFIX_V1}{package_hash}'
        first = select_manifest_meta(s3_client, bucket, manifest_key)
        stats = select_package_stats(s3_client, bucket, manifest_key)
        if not first:
            logger_.error("S3 select failed %s %s", bucket, manifest_key)
            return False
        try:
            first_dict = json.loads(first)
        except (json.JSONDecodeError, botocore.exceptions.ClientError) as exc:
            print(
                f"{exc}\n"
                f"\tFailed to select first line of manifest s3://{bucket}/{key}."
                f"\tGot {first}."
            )
            return False

    doc_queue.append(
        event_type,
        DocTypes.PACKAGE,
        bucket=bucket,
        etag=etag,
        ext=ext,
        handle=handle,
        key=key,
        last_modified=last_modified,
        # if we don't have the hash, we're processing a tag
        package_hash=(package_hash or pointer_file),
        package_stats=stats,
        pointer_file=pointer_file,
        comment=str(first_dict.get("message", "")),
        metadata=json.dumps(first_dict.get("user_meta", {})),
        version_id=version_id,
    )

    return True
Example #16
def index_if_package(
        s3_client,
        doc_queue: DocumentQueue,
        *,
        bucket: str,
        etag: str,
        key: str,
        last_modified: str,
        version_id: Optional[str],
) -> bool:
    """index manifest pointer files as package documents in ES
        Returns:
            - True if pointer to manifest (and passes to doc_queue for indexing)
            - False if not a manifest (no attempt at indexing)
    """
    logger_ = get_quilt_logger()
    pointer_prefix, pointer_file = split(key)
    handle = pointer_prefix[len(POINTER_PREFIX_V1):]
    if (
            not pointer_file
            or not pointer_prefix.startswith(POINTER_PREFIX_V1)
            or len(handle) < 3
            or '/' not in handle
    ):
        logger_.debug("Not indexing as manifest file s3://%s/%s", bucket, key)
        return False
    try:
        manifest_timestamp = int(pointer_file)
        if not 1451631600 <= manifest_timestamp <= 1767250800:
            logger_.warning("Unexpected manifest timestamp s3://%s/%s", bucket, key)
            return False
    except ValueError as err:
        logger_.debug("Non-integer manifest pointer: s3://%s/%s, %s", bucket, key, err)

    def get_pkg_data():
        try:
            package_hash = s3_client.get_object(
                Bucket=bucket,
                Key=key,
            )['Body'].read().decode()
        except botocore.exceptions.ClientError:
            return

        manifest_key = f'{MANIFEST_PREFIX_V1}{package_hash}'
        first = select_manifest_meta(s3_client, bucket, manifest_key)
        if not first:
            return
        stats = select_package_stats(s3_client, bucket, manifest_key)
        if not stats:
            return

        return {
            "key": key,
            "etag": etag,
            "version_id": version_id,
            "last_modified": last_modified,
            "delete_marker": False,  # TODO: remove
            "handle": handle,
            "pointer_file": pointer_file,
            "hash": package_hash,
            "package_stats": stats,
            "metadata": json.dumps(first.get("user_meta", {})),
            "comment": str(first.get("message", "")),
        }

    data = get_pkg_data() or {}
    doc_queue.append_document({
        "_index": bucket + PACKAGE_INDEX_SUFFIX,
        "_id": key,
        "_op_type": "index" if data else "delete",
        **data,
    })

    return True
Example #17
def handler(event, context):
    """enumerate S3 keys in event, extract relevant data, queue events, send to
    elastic via bulk() API
    """
    logger_ = get_quilt_logger()
    # message is a proper SQS message, which either contains a single event
    # (from the bucket notification system) or batch-many events as determined
    # by enterprise/**/bulk_loader.py
    # An exception that we'll want to re-raise after the batch sends
    content_exception = None
    batch_processor = DocumentQueue(context)
    s3_client = make_s3_client()
    for message in event["Records"]:
        body = json.loads(message["body"])
        body_message = json.loads(body["Message"])
        if "Records" not in body_message:
            # could be TEST_EVENT, or another unexpected event; skip it
            logger_.error("No 'Records' key in message['body']: %s", message)
            continue
        events = body_message["Records"]
        # event is a single S3 event
        for event_ in events:
            validated = shape_event(event_)
            if not validated:
                logger_.debug("Skipping invalid event %s", event_)
                continue
            event_ = validated
            logger_.debug("Processing %s", event_)
            try:
                event_name = event_["eventName"]
                # Process all Create:* and Remove:* events
                if not any(event_name.startswith(n) for n in EVENT_PREFIX.values()):
                    logger_.warning("Skipping unknown event type: %s", event_name)
                    continue
                bucket = event_["s3"]["bucket"]["name"]
                # In the grand tradition of IE6, S3 events turn spaces into '+'
                # TODO: check if eventbridge events do the same thing with +
                key = unquote_plus(event_["s3"]["object"]["key"])
                version_id = event_["s3"]["object"].get("versionId", None)
                # ObjectRemoved:Delete does not include "eTag"
                etag = event_["s3"]["object"].get("eTag", "")
                # synthetic events from bulk scanner might define lastModified
                last_modified = (
                    event_["s3"]["object"].get("lastModified") or event_["eventTime"]
                )
                # Get two levels of extensions to handle files like .csv.gz
                path = pathlib.PurePosixPath(key)
                ext1 = path.suffix
                ext2 = path.with_suffix('').suffix
                ext = (ext2 + ext1).lower()
                # Handle delete and deletemarker first and then continue so that
                # head_object and get_object (below) don't fail
                if event_name.startswith(EVENT_PREFIX["Removed"]):
                    do_index(
                        s3_client,
                        batch_processor,
                        event_name,
                        bucket=bucket,
                        etag=etag,
                        ext=ext,
                        key=key,
                        last_modified=last_modified,
                        version_id=version_id
                    )
                    continue
                try:
                    head = retry_s3(
                        "head",
                        bucket,
                        key,
                        s3_client=s3_client,
                        version_id=version_id,
                        etag=etag
                    )
                except botocore.exceptions.ClientError as first:
                    logger_.warning("head_object error: %s", first)
                    # "null" version sometimes results in 403s for buckets
                    # that have changed versioning, retry without it
                    if (first.response.get('Error', {}).get('Code') == "403"
                            and version_id == "null"):
                        try:
                            head = retry_s3(
                                "head",
                                bucket,
                                key,
                                s3_client=s3_client,
                                version_id=None,
                                etag=etag
                            )
                        except botocore.exceptions.ClientError as second:
                            # this will bypass the DLQ but that's the right thing to do
                            # as some listed objects may NEVER succeed head requests
                            # (e.g. foreign owner) and there's no reason to torpedo
                            # the whole batch (which might include good files)
                            logger_.warning("Retried head_object error: %s", second)
                    logger_.error("Fatal head_object, skipping event: %s", event_)
                    continue
                # backfill fields based on the head_object
                size = head["ContentLength"]
                last_modified = last_modified or head["LastModified"].isoformat()
                etag = head.get("etag") or etag
                version_id = head.get("VersionId") or version_id
                try:
                    text = maybe_get_contents(
                        bucket,
                        key,
                        ext,
                        etag=etag,
                        version_id=version_id,
                        s3_client=s3_client,
                        size=size
                    )
                # we still want an entry for this document in elastic so that, e.g.,
                # the file counts from elastic are correct
                # these exceptions can happen for a variety of reasons (e.g. glacier
                # storage class, index event arrives after delete has occurred, etc.)
                # given how common they are, we shouldn't fail the batch for this
                except Exception as exc:  # pylint: disable=broad-except
                    text = ""
                    logger_.warning("Content extraction failed %s %s %s", bucket, key, exc)

                do_index(
                    s3_client,
                    batch_processor,
                    event_name,
                    bucket=bucket,
                    etag=etag,
                    ext=ext,
                    key=key,
                    last_modified=last_modified,
                    size=size,
                    text=text,
                    version_id=version_id
                )

            except botocore.exceptions.ClientError as boto_exc:
                if not should_retry_exception(boto_exc):
                    logger_.warning("Skipping non-fatal exception: %s", boto_exc)
                    continue
                logger_.critical("Failed record: %s, %s", event, boto_exc)
                raise boto_exc
    # flush the queue
    batch_processor.send_all()
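
To drive handler() locally you need an SQS record whose body wraps an SNS-style "Message" containing S3 Records. The skeletal payload below is illustrative; bucket, key, and etag values are made up, and the context stub is the FakeLambdaContext sketched earlier.

import json

s3_record = {
    "eventName": "ObjectCreated:Put",
    "eventTime": "2021-01-01T00:00:00.000Z",
    "s3": {
        "bucket": {"name": "example-bucket"},
        "object": {"key": "data/report.csv", "eTag": "abc123", "versionId": "null"},
    },
}
test_event = {
    "Records": [
        {"body": json.dumps({"Message": json.dumps({"Records": [s3_record]})})}
    ]
}
# handler(test_event, FakeLambdaContext(300_000))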
Example #18
def maybe_get_contents(bucket, key, ext, *, etag, version_id, s3_client, size):
    """get the byte contents of a file if it's a target for deep indexing"""
    logger_ = get_quilt_logger()

    if ext.endswith('.gz'):
        compression = 'gz'
        ext = ext[:-len('.gz')]
    else:
        compression = None
    logger_.debug(
        "Entering maybe_get_contents (could run out of mem.) %s %s %s", bucket, key, version_id
    )
    content = ""
    inferred_ext = infer_extensions(key, ext)
    if inferred_ext in get_content_index_extensions(bucket_name=bucket):
        def _get_obj():
            return retry_s3(
                "get",
                bucket,
                key,
                size,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id,
            )

        if inferred_ext == ".fcs":
            obj = _get_obj()
            body, info = extract_fcs(get_bytes(obj["Body"], compression), as_html=False)
            # be smart and just send column names to ES (instead of bloated full schema)
            # if this is not an HTML/catalog preview
            content = trim_to_bytes(f"{body}\n{info}", get_content_index_bytes(bucket_name=bucket))
        elif inferred_ext == ".ipynb":
            content = trim_to_bytes(
                # we have no choice but to fetch the entire notebook, because we
                # are going to parse it
                # warning: huge notebooks could spike memory here
                get_notebook_cells(
                    bucket,
                    key,
                    size,
                    compression,
                    etag=etag,
                    s3_client=s3_client,
                    version_id=version_id
                ),
                get_content_index_bytes(bucket_name=bucket),
            )
        elif inferred_ext == ".parquet":
            if size >= get_available_memory():
                print(f"{bucket}/{key} too large to deserialize; skipping contents")
                # at least index the key and other stats, but don't overrun memory
                # and fail indexing altogether
                return ""
            obj = _get_obj()
            body, info = extract_parquet(
                get_bytes(obj["Body"], compression),
                as_html=False,
                skip_rows=(inferred_ext in SKIP_ROWS_EXTS),
                max_bytes=get_content_index_bytes(bucket_name=bucket),
            )
            # be smart and just send column names to ES (instead of bloated full schema)
            # if this is not an HTML/catalog preview
            columns = ','.join(list(info['schema']['names']))
            content = trim_to_bytes(f"{columns}\n{body}", get_content_index_bytes(bucket_name=bucket))
        elif inferred_ext == ".pdf":
            obj = _get_obj()
            content = trim_to_bytes(
                extract_pdf(get_bytes(obj["Body"], compression)),
                get_content_index_bytes(bucket_name=bucket),
            )
        elif inferred_ext in (".xls", ".xlsx"):
            obj = _get_obj()
            body, _ = extract_excel(get_bytes(obj["Body"], compression), as_html=False)
            content = trim_to_bytes(
                body,
                get_content_index_bytes(bucket_name=bucket),
            )
        elif inferred_ext == ".pptx":
            obj = _get_obj()
            content = extract_pptx(get_bytes(obj["Body"], compression), get_content_index_bytes(bucket_name=bucket))
        else:
            content = get_plain_text(
                bucket,
                key,
                size,
                compression,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id
            )

    return content
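
The trim_to_bytes() helper used throughout maybe_get_contents() caps indexed text by encoded size. A plausible implementation is sketched below as an assumption; the project's version may handle edge cases differently.

def trim_to_bytes(text: str, limit: int) -> str:
    """Sketch: truncate text so its UTF-8 encoding fits within limit bytes."""
    encoded = text.encode("utf-8")
    if len(encoded) <= limit:
        return text
    # drop bytes past the limit, discarding any partially cut multi-byte character
    return encoded[:limit].decode("utf-8", errors="ignore")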