Example 1
def get_image_bytes(
    submission_message: S3ImageSubmission,
    default_s3_bucket_image_prefix: str,
) -> bytes:
    """
    Takes a submission_message and identifies how best to get its bytes. Future
    work on re-using sessions for `requests`, or any other possible
    optimization, should go here.

    Once we have more hashing lambdas that need access to media, this could be
    moved into its own module.
    """
    return S3BucketContentSource(
        submission_message.bucket, default_s3_bucket_image_prefix
    ).get_bytes(submission_message.content_id)
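A minimal usage sketch, assuming S3ImageSubmission takes (content_id, bucket, key) positionally as it is constructed in Example 3 below; all values and the prefix are hypothetical placeholders, not real configuration:

# Hypothetical usage sketch for get_image_bytes; values are placeholders.
submission = S3ImageSubmission(
    "example-content-id",         # content_id
    "example-media-bucket",       # bucket
    "images/example-content-id",  # key
)
image_bytes = get_image_bytes(submission, default_s3_bucket_image_prefix="images/")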
Example 2
    def get_preview_url(content_id, content_object) -> str:
        """
        Given a content_id and a content_object, returns a URL that can be used
        to preview it.
        """
        content_object = t.cast(ContentObject, content_object)

        if content_object.content_ref_type == ContentRefType.DEFAULT_S3_BUCKET:
            source = S3BucketContentSource(image_bucket, image_prefix)
            preview_url = create_presigned_url(
                image_bucket,
                source.get_s3_key(content_id),
                None,
                3600,
                "get_object",
            )
        elif content_object.content_ref_type == ContentRefType.URL:
            preview_url = content_object.content_ref
        else:
            # Fail loudly instead of raising UnboundLocalError for unknown ref types.
            raise ValueError(
                f"Unsupported content_ref_type: {content_object.content_ref_type}"
            )
        return preview_url
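For context, a presigned GET URL like the one requested above can be produced with boto3 roughly as follows. This is only a sketch of what the create_presigned_url helper presumably wraps, not the project's actual implementation:

import boto3

def example_presigned_get_url(bucket: str, key: str, expiration: int = 3600) -> str:
    # Sketch only: generate a time-limited GET URL for an object in S3.
    s3_client = boto3.client("s3")
    return s3_client.generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket, "Key": key},
        ExpiresIn=expiration,
    )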
Example 3
    def from_sqs_message(
        cls, d: dict, image_prefix: str
    ) -> "S3ImageSubmissionBatchMessage":
        result = []

        for s3_record in d["Records"]:
            bucket_name = s3_record["s3"]["bucket"]["name"]
            key = unquote_plus(s3_record["s3"]["object"]["key"])

            # Ignore Folders and Empty Files
            if s3_record["s3"]["object"]["size"] == 0:
                logger.info("Disregarding empty file or directory: %s", key)
                continue

            content_id = S3BucketContentSource.get_content_id_from_s3_key(
                key, image_prefix)
            result.append(S3ImageSubmission(content_id, bucket_name, key))

        return cls(image_submissions=result)
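To illustrate the input shape, here is a minimal, made-up S3 event notification dict of the kind from_sqs_message parses; only the fields the method actually reads are included:

# Made-up S3 event notification payload; bucket, key, and size are placeholders.
example_event = {
    "Records": [
        {
            "s3": {
                "bucket": {"name": "example-media-bucket"},
                "object": {"key": "images/example-content-id", "size": 12345},
            }
        }
    ]
}

batch = S3ImageSubmissionBatchMessage.from_sqs_message(
    example_event, image_prefix="images/"
)
# batch.image_submissions is a list with one S3ImageSubmission for the record above.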
Example 4
def get_submit_api(
    dynamodb_table: Table,
    image_bucket: str,
    image_prefix: str,
    submissions_queue_url: str,
    hash_queue_url: str,
) -> bottle.Bottle:
    """
    A Closure that includes all dependencies that MUST be provided by the root
    API that this API plugs into. Declare dependencies here, but initialize in
    the root API alone.
    """

    # A prefix for all routes must be provided by the api_root app.
    # The documentation below expects the prefix to be '/submit/'.
    submit_api = bottle.Bottle()
    s3_bucket_image_source = S3BucketContentSource(image_bucket, image_prefix)

    def _content_exist_error(content_id: str):
        return bottle.abort(
            400,
            f"Content with id '{content_id}' already exists if you want to resubmit `force_resubmit=True` must be included in payload.",
        )

    def _record_content_submission_from_request(
        request: SubmitRequestBodyBase,
    ) -> bool:
        """
        Given a submission request, record the content object to the table
        passed to the API using 'record_content_submission'.
        Note: this method does not store the content media itself.
        """

        content_ref, content_ref_type = request.get_content_ref_details()

        return record_content_submission(
            dynamodb_table,
            content_id=request.content_id,
            content_type=request.content_type,
            content_ref=content_ref,
            content_ref_type=content_ref_type,
            additional_fields=set(request.additional_fields)
            if request.additional_fields else set(),
            force_resubmit=request.force_resubmit,
        )

    @submit_api.post("/url/",
                     apply=[jsoninator(SubmitContentViaURLRequestBody)])
    def submit_url(
        request: SubmitContentViaURLRequestBody,
    ) -> t.Union[SubmitResponse, SubmitError]:
        """
        Submission via a url to content. This does not store a copy of the content in s3
        """
        if not _record_content_submission_from_request(request):
            return _content_exist_error(request.content_id)

        send_submission_to_url_queue(
            dynamodb_table,
            submissions_queue_url,
            request.content_id,
            request.content_type,
            request.content_url,
        )

        return SubmitResponse(content_id=request.content_id,
                              submit_successful=True)

    @submit_api.post("/bytes/",
                     apply=[jsoninator(SubmitContentBytesRequestBody)])
    def submit_bytes(
        request: SubmitContentBytesRequestBody,
    ) -> t.Union[SubmitResponse, SubmitError]:
        """
        Submit of media to HMA via a direct transfer of bytes to the system's s3 bucket.
        """
        content_id = request.content_id
        file_contents = base64.b64decode(request.content_bytes)

        # We want to record the submission before triggering any processing on
        # the content itself, so we write to DynamoDB before S3.
        if not _record_content_submission_from_request(request):
            return _content_exist_error(request.content_id)

        s3_bucket_image_source.put_image_bytes(content_id, file_contents)

        return SubmitResponse(content_id=request.content_id,
                              submit_successful=True)

    @submit_api.post(
        "/put-url/",
        apply=[jsoninator(SubmitContentViaPutURLUploadRequestBody)])
    def submit_put_url(
        request: SubmitContentViaPutURLUploadRequestBody,
    ) -> t.Union[SubmitViaUploadUrlResponse, SubmitError]:
        """
        Submission of content to HMA in two steps
        1st the creation to a content record and put url based on request body
        2nd Upload to the system's s3 bucket by said put url returned by this method
        """
        presigned_url = create_presigned_put_url(
            bucket_name=image_bucket,
            key=s3_bucket_image_source.get_s3_key(request.content_id),
            file_type=request.file_type,
        )

        if presigned_url:
            if not _record_content_submission_from_request(request):
                return _content_exist_error(request.content_id)

            return SubmitViaUploadUrlResponse(
                content_id=request.content_id,
                file_type=str(request.file_type),
                presigned_url=presigned_url,
            )

        bottle.response.status = 400
        return SubmitError(
            content_id=request.content_id,
            message="Failed to generate upload url",
        )

    @submit_api.post("/hash/",
                     apply=[jsoninator(SubmitContentHashRequestBody)])
    def submit_hash(
        request: SubmitContentHashRequestBody,
    ) -> t.Union[SubmitResponse, SubmitError]:
        """
        Submission of a hash from a piece of content.
        Functions the same as other submission endpoint but skips
        the hasher and media storage.
        """

        # Record the content object (even though, as with URL submissions, we don't store the media)
        if not _record_content_submission_from_request(request):
            return _content_exist_error(request.content_id)

        # Record hash
        # TODO: expand the submit hash API to include `signal_specific_attributes`
        hash_record = PipelineHashRecord(
            content_id=request.content_id,
            signal_type=t.cast(t.Type[SignalType], request.signal_type),
            content_hash=request.signal_value,
            updated_at=datetime.datetime.now(),
        )
        hash_record.write_to_table(dynamodb_table)

        # Send the hash directly to the matcher.
        # TODO: this could maybe reuse the methods in UnifiedHasher from #749.
        _get_sqs_client().send_message(
            QueueUrl=hash_queue_url,
            MessageBody=json.dumps(hash_record.to_sqs_message()),
        )

        return SubmitResponse(content_id=request.content_id,
                              submit_successful=True)

    return submit_api
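A sketch of how the returned sub-application could be mounted under the '/submit/' prefix mentioned in the comment above; every dependency value here is a placeholder that the root API would normally provide:

import boto3
import bottle

# Placeholder dependencies; in practice the root API alone initializes these,
# as the docstring above requires.
root_api = bottle.Bottle()
root_api.mount(
    "/submit/",
    get_submit_api(
        dynamodb_table=boto3.resource("dynamodb").Table("ExampleHMATable"),  # placeholder
        image_bucket="example-image-bucket",                      # placeholder
        image_prefix="images/",                                   # placeholder
        submissions_queue_url="https://sqs.example/submissions",  # placeholder
        hash_queue_url="https://sqs.example/hashes",              # placeholder
    ),
)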
Example 5
def lambda_handler(event, context):
    """
    SQS Events generated by the submissions API or by files being added to S3.
    Downloads files to temp-storage, identifies content_type and generates
    allowed signal_types from it.

    Saves hash output to DynamoDB, sends a message on an output queue.

    Note that this brings the contents of a file into memory. This is subject to
    the resource limitation on the lambda. Potentially extendable until 10GB, but
    that would be super-expensive. [1]

    [1]: https://docs.aws.amazon.com/lambda/latest/dg/configuration-console.html
    """
    records_table = get_dynamodb().Table(DYNAMODB_TABLE)
    HMAConfig.initialize(HMA_CONFIG_TABLE)
    banks_table = BanksTable(
        get_dynamodb().Table(BANKS_TABLE),
        _get_signal_type_mapping(),
    )
    sqs_client = get_sqs_client()

    hasher = _get_hasher(_get_signal_type_mapping())

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "s3:TestEvent":
            continue

        media_to_process: t.List[t.Union[S3ImageSubmission,
                                         URLSubmissionMessage,
                                         BankSubmissionMessage]] = []

        if URLSubmissionMessage.could_be(message):
            media_to_process.append(
                URLSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()))
        elif S3ImageSubmissionBatchMessage.could_be(message):
            # S3 submissions can only be images for now.
            media_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_PREFIX).image_submissions)
        elif BankSubmissionMessage.could_be(message):
            media_to_process.append(
                BankSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()))
        else:
            logger.warn(f"Unprocessable Message: {message}")

        for media in media_to_process:
            if not hasher.supports(media.content_type):
                if isinstance(media, BankSubmissionMessage):
                    object_id = media.bank_id
                else:
                    object_id = media.content_id
                logger.warning(
                    f"Unprocessable content type: {media.content_type}, id: {object_id}"
                )
                continue

            with metrics.timer(metrics.names.hasher.download_file):
                try:
                    if hasattr(media, "key") and hasattr(media, "bucket"):
                        # Classic duck-typing. If it has key and bucket, must be an
                        # S3 submission.
                        media = t.cast(S3ImageSubmission, media)
                        bytes_: bytes = S3BucketContentSource(
                            media.bucket,
                            IMAGE_PREFIX).get_bytes(media.content_id)
                    else:
                        media = t.cast(URLSubmissionMessage, media)
                        bytes_: bytes = URLContentSource().get_bytes(media.url)
                except Exception:
                    if isinstance(media, BankSubmissionMessage):
                        object_id = media.bank_id
                    else:
                        object_id = media.content_id
                    logger.exception(
                        f"Encountered exception while trying to get_bytes for id: {object_id}. Unable to hash content."
                    )
                    continue

            for signal in hasher.get_hashes(media.content_type, bytes_):
                if isinstance(media, BankSubmissionMessage):
                    # route signals to bank datastore only.
                    bank_operations.add_bank_member_signal(
                        banks_table=banks_table,
                        bank_id=media.bank_id,
                        bank_member_id=media.bank_member_id,
                        signal_type=signal.signal_type,
                        signal_value=signal.signal_value,
                    )
                    # don't write hash records etc.
                    continue

                hash_record = PipelineHashRecord(
                    content_id=media.content_id,
                    signal_type=signal.signal_type,
                    content_hash=signal.signal_value,
                    updated_at=datetime.datetime.now(),
                )

                hasher.write_hash_record(records_table, hash_record)
                hasher.publish_hash_message(sqs_client, hash_record)

    metrics.flush()
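For reference, the handler above consumes Lambda events shaped roughly like this; each SQS record's body is a JSON string that one of the submission message classes recognizes via its could_be() check. The values below are made up:

import json

# Sketch of the Lambda event envelope the handler iterates over.
example_event = {
    "Records": [
        # S3 test events are skipped outright by the handler.
        {"body": json.dumps({"Event": "s3:TestEvent"})},
        # Real records carry a JSON body describing a URL submission, an S3
        # image batch, or a bank submission.
    ]
}
# lambda_handler(example_event, context=None)  # requires the tables/queues to be configured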
Example 6
def lambda_handler(event, context):
    """
    SQS Events generated by the submissions API or by files being added to S3.
    Downloads files to temp-storage, identifies content_type and generates
    allowed signal_types from it.

    Saves hash output to DynamoDB, sends a message on an output queue.

    Note that this brings the contents of a file into memory. This is subject to
    the resource limitation on the lambda. Potentially extendable until 10GB, but
    that would be super-expensive. [1]

    [1]: https://docs.aws.amazon.com/lambda/latest/dg/configuration-console.html
    """
    records_table = get_dynamodb().Table(DYNAMODB_TABLE)
    sqs_client = get_sqs_client()

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "s3:TestEvent":
            continue

        media_to_process: t.List[t.Union[S3ImageSubmission,
                                         URLSubmissionMessage]] = []

        if URLSubmissionMessage.could_be(message):
            media_to_process.append(
                URLSubmissionMessage.from_sqs_message(message))
        elif S3ImageSubmissionBatchMessage.could_be(message):
            # S3 submissions can only be images for now.
            media_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_PREFIX).image_submissions)
        else:
            logger.warning(f"Unprocessable message: {message}")

        for media in media_to_process:
            if not hasher.supports(media.content_type):
                logger.warning(
                    f"Unprocessable content type: {media.content_type}")
                continue

            with metrics.timer(metrics.names.hasher.download_file):
                if hasattr(media, "key") and hasattr(media, "bucket"):
                    # Classic duck-typing. If it has key and bucket, must be an
                    # S3 submission.
                    bytes_: bytes = S3BucketContentSource(
                        media.bucket, IMAGE_PREFIX).get_bytes(media.content_id)
                else:
                    bytes_: bytes = URLContentSource().get_bytes(media.url)

            for signal in hasher.get_hashes(media.content_id,
                                            media.content_type, bytes_):
                hash_record = PipelineHashRecord(
                    content_id=media.content_id,
                    signal_type=signal.signal_type,
                    content_hash=signal.signal_value,
                    updated_at=datetime.datetime.now(),
                )

                hasher.write_hash_record(records_table, hash_record)
                hasher.publish_hash_message(sqs_client, hash_record)

    metrics.flush()