Example no. 1
0
def lambda_handler(event, context):
    """
    Consume S3 object-created notifications delivered via SNS -> SQS:
    download each new object, compute its PDQ hash, persist the hash to
    DynamoDB, and publish the result on an output SQS queue.

    Skips ``s3:TestEvent`` notifications and zero-byte objects
    (folders / empty files). Object keys are URL-decoded before use.
    """
    table = dynamodb.Table(DYNAMODB_TABLE)
    for sqs_record in event["Records"]:
        # The SQS body wraps an SNS notification, which wraps the S3 event.
        sns_notification = json.loads(sqs_record["body"])
        message = json.loads(sns_notification["Message"])
        if message.get("Event") == "s3:TestEvent":
            logger.info("Disregarding S3 Test Event")
            continue
        for s3_record in message["Records"]:
            bucket_name = s3_record["s3"]["bucket"]["name"]
            # S3 event keys arrive URL-encoded (spaces become '+', etc.).
            key = unquote_plus(s3_record["s3"]["object"]["key"])
            # Ignore Folders and Empty Files
            if s3_record["s3"]["object"]["size"] == 0:
                logger.info("Disregarding empty file or directory: %s", key)
                continue
            logger.info("generating pdq hash for %s/%s", bucket_name, key)
            with tempfile.NamedTemporaryFile() as tmp_file:
                path = Path(tmp_file.name)
                s3_client.download_fileobj(bucket_name, key, tmp_file)
                # Bug fix: flush buffered writes before the hasher re-opens
                # the file by path, otherwise it may read a truncated object.
                tmp_file.flush()
                pdq_hash, quality = pdq_hasher.pdq_from_file(path)

                # NOTE(review): datetime.now() is naive local time — confirm
                # whether the datastore expects UTC.
                save_hash_to_datastore(table, key, pdq_hash, quality,
                                       datetime.datetime.now())

                output = {"hash": pdq_hash, "type": "pdq", "key": key}
                logger.info("publishing new pdq hash")
                sqs_client.send_message(
                    QueueUrl=OUTPUT_QUEUE_URL,
                    MessageBody=json.dumps(output),
                )
Example no. 2
0
    def test_pdq_from_file(self):
        """Round-trip a known image through the PDQ hasher and check the hash."""
        image_bytes = base64.b64decode(RANDOM_IMAGE_BASE64)
        with tempfile.NamedTemporaryFile("w+b") as tmp:
            tmp.write(image_bytes)
            # Make sure the bytes hit disk before the hasher re-opens the path.
            tmp.flush()

            computed_hash, _quality = pdq_hasher.pdq_from_file(pathlib.Path(tmp.name))
            assert computed_hash == RANDOM_IMAGE_PDQ
Example no. 3
0
 def hash_from_file(cls, file: pathlib.Path) -> str:
     """Return the PDQ hash of *file*, or "" if the pdq extras are missing.

     Requires Pillow and pdqhash (the [pdq_hasher] extra); when the import
     fails, a warning is emitted via _raise_pillow_warning and "" is
     returned instead of raising.
     """
     try:
         from threatexchange.hashing.pdq_hasher import pdq_from_file
     except ImportError:
         # Bug fix: narrowed from a bare `except:` so only the expected
         # missing-dependency failure is downgraded to a warning —
         # KeyboardInterrupt, SystemExit, and real bugs now propagate.
         _raise_pillow_warning()
         return ""
     pdq_hash, _quality = pdq_from_file(file)
     return pdq_hash
Example no. 4
0
def lambda_handler(event, context):
    """
    Listens to SQS events generated when new files are added to S3. Downloads
    files to temp-storage, generates PDQ hash and quality from the file.

    Saves hash output to dynamodb.

    Sends a message on an output queue.

    Note: Lambdas have pretty strong tempfile storage limits (512MB as of this
    writing) [1]. We are using the tempfile module in a context manager block,
    so the file gets deleted after use. If additional files are created, ensure
    they are inside their own context managers otherwise the lambda can run out
    of disk-space.

    1: https://docs.aws.amazon.com/lambda/latest/dg/images-create.html
    """

    records_table = dynamodb.Table(DYNAMODB_TABLE)
    store = HashStore(records_table)

    for sqs_record in event["Records"]:
        # SQS body -> SNS notification -> S3 event payload.
        sns_notification = json.loads(sqs_record["body"])
        message = json.loads(sns_notification["Message"])

        if message.get("Event") == "s3:TestEvent":
            logger.info("Disregarding S3 Test Event")
            continue

        for s3_record in message["Records"]:
            bucket_name = s3_record["s3"]["bucket"]["name"]
            # S3 event keys arrive URL-encoded.
            key = unquote_plus(s3_record["s3"]["object"]["key"])

            # Ignore Folders and Empty Files
            if s3_record["s3"]["object"]["size"] == 0:
                logger.info("Disregarding empty file or directory: %s", key)
                continue

            logger.info("generating pdq hash for %s/%s", bucket_name, key)
            with tempfile.NamedTemporaryFile() as tmp_file:
                path = Path(tmp_file.name)
                s3_client.download_fileobj(bucket_name, key, tmp_file)
                # Bug fix: flush buffered writes before the hasher re-opens
                # the file by path, otherwise it may read a truncated object.
                tmp_file.flush()
                pdq_hash, quality = pdq_hasher.pdq_from_file(path)
                # NOTE(review): datetime.now() is naive local time — confirm
                # whether the hash store expects UTC.
                hash_record = PDQHashRecord(key, pdq_hash, quality,
                                            datetime.datetime.now())

                # Add to dynamodb hash store
                store.add_hash(hash_record)

                # Publish to SQS queue
                sqs_client.send_message(
                    QueueUrl=OUTPUT_QUEUE_URL,
                    MessageBody=json.dumps(hash_record.to_sqs_message()),
                )

                logger.info("Published new PDQ hash")
Example no. 5
0
 def hash_file(cls, file: pathlib.Path) -> str:
     """Return the PDQ hash of *file*, or "" if the pdq extras are missing.

     Requires Pillow and pdqhash (the [pdq_hasher] extra); when the import
     fails, a UserWarning is emitted and "" is returned instead of raising.
     """
     try:
         from threatexchange.hashing.pdq_hasher import pdq_from_file
     except ImportError:
         # Bug fix: narrowed from a bare `except:` so only the expected
         # missing-dependency failure is downgraded to a warning —
         # KeyboardInterrupt, SystemExit, and real bugs now propagate.
         warnings.warn(
             "PDQ from file require Pillow and pdqhash to be installed; install threatexchange with the [pdq_hasher] extra to use them",
             category=UserWarning,
         )
         return ""
     pdq_hash, _quality = pdq_from_file(file)
     return pdq_hash
Example no. 6
0
 def match_file(self, file: pathlib.Path) -> t.List[signal_base.SignalMatch]:
     """Simple PDQ file match.

     Hashes *file* with PDQ and returns matches against the indexed hashes.
     Returns [] (after emitting a UserWarning) when the optional pdq
     dependencies (Pillow + pdqhash, the [pdq_hasher] extra) are missing.
     """
     try:
         from threatexchange.hashing.pdq_hasher import pdq_from_file
     except ImportError:
         # Bug fix: narrowed from a bare `except:` so only the expected
         # missing-dependency failure is downgraded to a warning.
         warnings.warn(
             "PDQ from file require Pillow and pdqhash to be installed; install threatexchange with the [pdq_hasher] extra to use them",
             category=UserWarning,
         )
         return []
     # quality is not used for matching; only the hash is looked up.
     pdq_hash, _quality = pdq_from_file(file)
     return self.match_hash(pdq_hash)
Example no. 7
0
def check_for_match_image():
    """Flask view: save an uploaded photo, PDQ-hash it, render its matches.

    Bug fixes versus the original:
    - An upload with an empty filename previously skipped the save but then
      referenced the unbound ``file_path`` (NameError); hashing now only
      happens when a file was actually saved.
    - Removed a ``with open(file_path, "rb")`` whose handle was never used
      (``pdq_from_file`` re-opens the file by path itself).
    - Renamed the local ``hash`` so it no longer shadows the builtin.
    """
    if request.method == "POST":
        uploaded_file = request.files["photo"]
        if uploaded_file.filename != "":
            # SECURITY: the client-supplied filename is joined into a server
            # path unsanitized — consider werkzeug.utils.secure_filename to
            # prevent path traversal.
            file_path = os.path.join(
                config_helper.upload_folder, uploaded_file.filename
            )
            uploaded_file.save(file_path)

            pdq_hash, _quality = pdq_from_file(file_path)
            matches = query_index(pdq_hash)
            return render_template(
                "results.html",
                result=matches,
                hash=pdq_hash,
                image=uploaded_file.filename,
            )
Example no. 8
0
    def hash_from_file(cls, file: pathlib.Path) -> str:
        """Return the combined "pdq_hash,ocr_text" signal string for *file*."""
        # The two extractions are independent reads of the same file.
        ocr_text = text_from_image_file(file)
        pdq_hash, _quality = pdq_from_file(file)
        return f"{pdq_hash},{ocr_text}"