Example #1
def lambda_handler(event, context):
    """
    Listens to SQS events generated when new files are added to S3. Downloads
    files to temp-storage, generates PDQ hash and quality from the file.

    Saves hash output to dynamodb.

    Sends a message on an output queue.

    Note: Lambdas have pretty strong tempfile storage limits (512MB as of this
    writing) [1]. We are using the tempfile module in a context manager block,
    so the file gets deleted after use. If additional files are created, ensure
    they are inside their own context managers otherwise the lambda can run out
    of disk-space.

    1: https://docs.aws.amazon.com/lambda/latest/dg/images-create.html
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    for sqs_record in event["Records"]:
        sns_notification = json.loads(sqs_record["body"])
        message = json.loads(sns_notification["Message"])

        if message.get("Event") == "s3:TestEvent":
            logger.info("Disregarding S3 Test Event")
            continue

        for s3_record in message["Records"]:
            bucket_name = s3_record["s3"]["bucket"]["name"]
            key = unquote_plus(s3_record["s3"]["object"]["key"])

            # Ignore Folders and Empty Files
            if s3_record["s3"]["object"]["size"] == 0:
                logger.info("Disregarding empty file or directory: %s", key)
                continue

            logger.info("generating pdq hash for %s/%s", bucket_name, key)

            with metrics.timer(metrics.names.pdq_hasher_lambda.download_file):
                bytes_: bytes = s3_client.get_object(Bucket=bucket_name,
                                                     Key=key)["Body"].read()

            with metrics.timer(metrics.names.pdq_hasher_lambda.hash):
                pdq_hash, quality = pdq_hasher.pdq_from_bytes(bytes_)

            hash_record = PipelinePDQHashRecord(key, pdq_hash,
                                                datetime.datetime.now(),
                                                quality)

            hash_record.write_to_table(records_table)

            # Publish to SQS queue
            sqs_client.send_message(
                QueueUrl=OUTPUT_QUEUE_URL,
                MessageBody=json.dumps(hash_record.to_sqs_message()),
            )

            logger.info("Published new PDQ hash")

    metrics.flush()
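
For reference, the handler above peels two layers of JSON: the SQS body is an
SNS notification, and its Message field holds the raw S3 event. A minimal
sketch of a synthetic invocation (bucket and key names here are hypothetical):

import json

s3_event = {"Records": [{"s3": {
    "bucket": {"name": "media-bucket"},
    "object": {"key": "photos/cat.jpg", "size": 1024},
}}]}
sns_notification = {"Message": json.dumps(s3_event)}
sqs_event = {"Records": [{"body": json.dumps(sns_notification)}]}

# lambda_handler(sqs_event, None) would download photos/cat.jpg, hash it,
# write the record to DynamoDB, and publish to the output queue.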

Example #2
def lambda_handler(event, context):
    """
    Listens to SQS events generated when new files are added to S3. Downloads
    files to temp-storage, generates PDQ hash and quality from the file.

    The SQS events could be from S3 or directly from the Submission API lambdas
    in case of a URL submission.

    Saves hash output to dynamodb.

    Sends a message on an output queue.

    Note: The image is brought into memory and then handed over to the hasher.
    If you are hashing large images, you may need to increase the memory
    allocated to the lambda. Also remember that images that look small on disk
    (eg. low quality jpegs) still occupy a lot of space in memory. The
    pixel-size of the image is a better indicator of the space it will take in
    memory.
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    for sqs_record in event["Records"]:
        message_body = json.loads(sqs_record["body"])
        message = json.loads(message_body["Message"])

        if message.get("Event") == "s3:TestEvent":
            logger.info("Disregarding S3 Test Event")
            continue

        images_to_process: t.List[t.Union[URLImageSubmissionMessage,
                                          S3ImageSubmission]] = []

        # Dispatch on message shape: either a single URL submission or a
        # batch of S3 image submissions.
        if URLImageSubmissionMessage.could_be(message):
            images_to_process.append(
                URLImageSubmissionMessage.from_sqs_message(message))
        elif S3ImageSubmissionBatchMessage.could_be(message):
            images_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_FOLDER_KEY).image_submissions)
        else:
            logger.warning("PDQ Hasher could not process incoming message %s",
                           repr(message))

        for image in images_to_process:
            logger.info("Getting bytes for submission:  %s", repr(image))
            with metrics.timer(metrics.names.pdq_hasher_lambda.download_file):
                bytes_: bytes = get_image_bytes(image, IMAGE_FOLDER_KEY)

            logger.info("Generating PDQ hash for submission: %s", repr(image))

            with metrics.timer(metrics.names.pdq_hasher_lambda.hash):
                pdq_hash, quality = pdq_hasher.pdq_from_bytes(bytes_)

            hash_record = PipelinePDQHashRecord(image.content_id, pdq_hash,
                                                datetime.datetime.now(),
                                                quality)

            hash_record.write_to_table(records_table)

            # Publish to SQS queue
            sqs_client.send_message(
                QueueUrl=OUTPUT_QUEUE_URL,
                MessageBody=json.dumps(hash_record.to_sqs_message()),
            )

            logger.info("Published new PDQ hash")

    metrics.flush()
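
get_image_bytes is defined elsewhere in the module. A minimal sketch of what
it plausibly does, assuming the submission objects expose url, bucket, and key
attributes (the helper's real signature and these attribute names are
assumptions, not the project's actual API):

import urllib.request

def get_image_bytes_sketch(image, image_prefix):
    # URL submissions: fetch the bytes over HTTP.
    if isinstance(image, URLImageSubmissionMessage):
        with urllib.request.urlopen(image.url) as response:
            return response.read()
    # S3 submissions: read the object through the shared boto3 client.
    return s3_client.get_object(Bucket=image.bucket,
                                Key=image.key)["Body"].read()

On the docstring's memory note: a 4000x3000 JPEG may be ~2MB on disk but
decodes to roughly 4000 * 3000 * 3 bytes (about 36MB) of RGB pixels, so size
the lambda's memory by pixel dimensions, not file size.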