Example 1
def lambda_handler(event, context):
    """
    Listens to SQS events generated when new files are added to S3. Downloads
    files to temp-storage, generates PDQ hash and quality from the file.

    Saves hash output to dynamodb.

    Sends a message on an output queue.

    Note: Lambdas have fairly tight temporary file storage limits (512MB as of
    this writing) [1]. We use the tempfile module in a context manager block,
    so the file gets deleted after use. If additional files are created, ensure
    they are inside their own context managers, otherwise the lambda can run
    out of disk space.

    1: https://docs.aws.amazon.com/lambda/latest/dg/images-create.html
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    for sqs_record in event["Records"]:
        sns_notification = json.loads(sqs_record["body"])
        message = json.loads(sns_notification["Message"])

        if message.get("Event") == "s3:TestEvent":
            logger.info("Disregarding S3 Test Event")
            continue

        for s3_record in message["Records"]:
            bucket_name = s3_record["s3"]["bucket"]["name"]
            key = unquote_plus(s3_record["s3"]["object"]["key"])

            # Ignore Folders and Empty Files
            if s3_record["s3"]["object"]["size"] == 0:
                logger.info("Disregarding empty file or directory: %s", key)
                continue

            logger.info("generating pdq hash for %s/%s", bucket_name, key)

            with metrics.timer(metrics.names.pdq_hasher_lambda.download_file):
                bytes_: bytes = s3_client.get_object(Bucket=bucket_name,
                                                     Key=key)["Body"].read()

            with metrics.timer(metrics.names.pdq_hasher_lambda.hash):
                pdq_hash, quality = pdq_hasher.pdq_from_bytes(bytes_)

            hash_record = PipelinePDQHashRecord(key, pdq_hash,
                                                datetime.datetime.now(),
                                                quality)

            hash_record.write_to_table(records_table)

            # Publish to SQS queue
            sqs_client.send_message(
                QueueUrl=OUTPUT_QUEUE_URL,
                MessageBody=json.dumps(hash_record.to_sqs_message()),
            )

            logger.info("Published new PDQ hash")

    metrics.flush()
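
The note in the docstring above recommends keeping any files written to local storage inside their own context managers so Lambda's limited /tmp space is reclaimed promptly. A minimal sketch of that pattern, reusing the s3_client and pdq_hasher seen in the snippet; the helper name hash_via_tempfile is hypothetical:

import tempfile

def hash_via_tempfile(bucket_name: str, key: str):
    # Illustrative only: the temp file exists solely inside the `with` block,
    # so /tmp space is released as soon as the hash has been computed.
    with tempfile.NamedTemporaryFile() as tmp:
        s3_client.download_fileobj(bucket_name, key, tmp)
        tmp.seek(0)
        return pdq_hasher.pdq_from_bytes(tmp.read())
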
Example 2
def worker():
    # just spend some time
    with timer(names.pdq_hasher_lambda.download_file):
        time.sleep(random.random() / 100.0)

    with timer(names.pdq_hasher_lambda.hash):
        time.sleep(random.random() / 100.0)
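
The timer used throughout these examples comes from the project's metrics module and is not shown here. Purely for orientation, a timing context manager of this kind could be as small as the sketch below; log_duration and its print-based reporting are stand-ins, not the real implementation:

import contextlib
import time

@contextlib.contextmanager
def log_duration(metric_name: str):
    # Stand-in for metrics.timer: time the wrapped block and report the
    # elapsed duration, even if the block raises.
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed_ms = (time.perf_counter() - start) * 1000.0
        print(f"{metric_name}: {elapsed_ms:.2f} ms")

# Usage mirrors the worker above:
# with log_duration("pdq_hasher_lambda.hash"):
#     time.sleep(0.01)
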
Example 3
    def load_data(self) -> t.Dict[str, t.List[HashRowT]]:
        """
        loads all data from all files in TE that match the concrete
        implementation's indicator type

        returns a mapping from file name to list of rows
        """
        logger.info("Retreiving %s Data from S3", self.file_type_str_name)
        with metrics.timer(self.metrics_logger.download_datafiles):
            # S3 doesn't have a built-in concept of folders, but the AWS UI
            # implements folder-like functionality using prefixes. We follow
            # the same convention here, using the folder name in a prefix search.
            s3_bucket_files = s3_client.list_objects_v2(
                Bucket=self.config.threat_exchange_data_bucket_name,
                Prefix=self.config.threat_exchange_data_folder,
            )["Contents"]
            logger.info("Found %d Files", len(s3_bucket_files))

            typed_data_files = {
                file["Key"]: self._get_file(file["Key"])
                for file in s3_bucket_files
                if file["Key"].endswith(self.indicator_type_file_extension)
            }
            logger.info("Found %d %s Files", len(typed_data_files),
                        self.file_type_str_name)

        with metrics.timer(self.metrics_logger.parse_datafiles):
            logger.info("Parsing %s Hash files", self.file_type_str_name)
            typed_data = {
                file_name: self._parse_file(**typed_data_file)
                for file_name, typed_data_file in typed_data_files.items()
            }

        return typed_data
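
One caveat about the prefix listing in load_data: list_objects_v2 returns at most 1,000 keys per call, so a data folder that grows beyond that would be silently truncated. A hedged sketch of how a paginator would cover that case (the helper name is an assumption, not part of the adapter above):

def list_all_keys(bucket: str, prefix: str):
    # Illustrative only: walk every page of results instead of relying on a
    # single list_objects_v2 call, which caps out at 1,000 keys per response.
    paginator = s3_client.get_paginator("list_objects_v2")
    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        keys.extend(obj["Key"] for obj in page.get("Contents", []))
    return keys
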
Example 4
def lambda_handler(event, context):
    """
    Listens to SQS events fired when new data files are added to the data
    bucket's data directory. If the updated key matches a set of criteria,
    converts the raw data file into an index and writes to an output S3 bucket.

    As per the default configuration, the bucket must be
    - the hashing data bucket eg.
      dipanjanm-hashing-data20210224213427723700000003
    - the key name must be threat_exchange_data/pdq.te

    This means adding new versions of the datasets will not have an effect; you
    must add the exact pdq.te file.
    """

    if not was_pdq_data_updated(event):
        logger.info("PDQ Data Not Updated, skipping")
        return

    logger.info("PDQ Data Updated, updating pdq hash index")

    logger.info("Retreiving PDQ Data from S3")

    with metrics.timer(metrics.names.pdq_indexer_lambda.download_datafile):
        pdq_data_file = s3_client.get_object(
            Bucket=THREAT_EXCHANGE_DATA_BUCKET_NAME,
            Key=THREAT_EXCHANGE_PDQ_DATA_KEY)

    with metrics.timer(metrics.names.pdq_indexer_lambda.parse_datafile):
        pdq_data_reader = csv.DictReader(
            codecs.getreader("utf-8")(pdq_data_file["Body"]),
            fieldnames=PDQ_DATA_FILE_COLUMNS,
        )
        pdq_data = [
            (
                row["hash"],
                # Also add hash to metadata for easy look up on match
                {
                    "id": int(row["id"]),
                    "hash": row["hash"],
                },
            ) for row in pdq_data_reader
        ]

    with metrics.timer(metrics.names.pdq_indexer_lambda.build_index):
        logger.info("Creating PDQ Hash Index")

        index = PDQIndex.build(pdq_data)

        logger.info("Putting index in S3")
        index_bytes = pickle.dumps(index)

    with metrics.timer(metrics.names.pdq_indexer_lambda.upload_index):
        s3_client.put_object(Bucket=INDEXES_BUCKET_NAME,
                             Key=PDQ_INDEX_KEY,
                             Body=index_bytes)

    logger.info("Index update complete")
    metrics.flush()
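
The parse step above wraps the streaming S3 body in a UTF-8 reader so csv.DictReader can iterate it without first loading the whole file. The same pattern against an in-memory stream, with fake data and column names that only approximate PDQ_DATA_FILE_COLUMNS:

import codecs
import csv
import io

# Assumed columns; the real PDQ_DATA_FILE_COLUMNS may differ.
columns = ["hash", "id"]
raw = io.BytesIO(b"f" * 64 + b",12345\n")  # one fake "hash,id" row

reader = csv.DictReader(codecs.getreader("utf-8")(raw), fieldnames=columns)
rows = [(row["hash"], {"id": int(row["id"]), "hash": row["hash"]}) for row in reader]
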
Example 5
def lambda_handler(event, context):
    """
    Runs on a schedule. On each run, gets all data files for
    ALL_INDEXABLE_SIGNAL_TYPES from s3, converts the raw data file into an index
    and writes to an output S3 bucket.

    As per the default configuration, the bucket must be
    - the hashing data bucket eg. dipanjanm-hashing-<...>
    - the key name must be in the ThreatExchange folder (eg.
      threat_exchange_data/)
    - the key name must return a signal_type in
      ThreatUpdateS3Store.get_signal_type_from_object_key
    """
    # Note: even though we know which files were updated, threatexchange indexes
    # do not yet allow adding new entries, so we must do a full rebuild. As a
    # result, we only end up using the signal types that were updated, not the
    # actual files that changed.

    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
    )

    banks_table = BanksTable(dynamodb.Table(BANKS_TABLE))

    for signal_type in ALL_INDEXABLE_SIGNAL_TYPES:
        adapter_class = _ADAPTER_MAPPING[signal_type]
        data_files = adapter_class(
            config=s3_config,
            metrics_logger=metrics.names.indexer).load_data()

        bank_data = get_all_bank_hash_rows(signal_type, banks_table)

        with metrics.timer(metrics.names.indexer.merge_datafiles):
            logger.info(f"Merging {signal_type} Hash files")

            # go from dict[filename, list<hash rows>] → list<hash rows>
            flattened_data = [
                hash_row for file_ in data_files.values() for hash_row in file_
            ]

            merged_data = functools.reduce(merge_hash_rows_on_hash_value,
                                           flattened_data + bank_data,
                                           {}).values()

        with metrics.timer(metrics.names.indexer.build_index):
            logger.info(f"Rebuilding {signal_type} Index")

            for index_class in INDEX_MAPPING[signal_type]:
                index: S3BackedInstrumentedIndexMixin = index_class.build(
                    merged_data)

                logger.info(
                    f"Putting {signal_type} index in S3 for index {index.get_index_class_name()}"
                )
                index.save(bucket_name=INDEXES_BUCKET_NAME)
            metrics.flush()

    logger.info("Index updates complete")
Example 6
def lambda_handler(event, context):
    """
    Listens to SQS events fired when new data files are added to the data
    bucket's data directory. If the updated key matches a set of criteria,
    converts the raw data file into an index and writes to an output S3 bucket.

    As per the default configuration, the bucket must be
    - the hashing data bucket eg.
      dipanjanm-hashing-data20210224213427723700000003
    - the key name must be in the ThreatExchange folder (eg. threat_exchange_data/)
    - the key name must be a pdq file ending in ".pdq.te"

    This means adding new versions of the datasets will not have an effect; you
    must add the exact pdq.te file.
    """

    if not was_pdq_data_updated(event):
        logger.info("PDQ Data Not Updated, skipping")
        return

    logger.info("PDQ Data Updated, updating pdq hash index")
    metrics_logger = metrics.names.pdq_indexer_lambda

    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        threat_exchange_pdq_file_extension=THREAT_EXCHANGE_PDQ_FILE_EXTENSION,
    )

    pdq_data_files = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics_logger).load_data()

    with metrics.timer(metrics_logger.merge_datafiles):
        logger.info("Merging PDQ Hash files")
        flat_pdq_data = [
            hash_row for pdq_file in pdq_data_files.values()
            for hash_row in pdq_file
        ]

        merged_pdq_data = reduce(merge_pdq_files, flat_pdq_data, {}).values()

    with metrics.timer(metrics_logger.build_index):
        logger.info("Creating PDQ Hash Index")
        index = PDQIndex.build(merged_pdq_data)

        logger.info("Putting index in S3")
        index_bytes = pickle.dumps(index)

    with metrics.timer(metrics_logger.upload_index):
        s3_client.put_object(Bucket=INDEXES_BUCKET_NAME,
                             Key=PDQ_INDEX_KEY,
                             Body=index_bytes)

    logger.info("Index update complete")
    metrics.flush()
Example 7
def get_index(bucket_name, key):
    """
    Load the given index from the s3 bucket and deserialize it
    """
    # TODO Cache this index for a period of time to reduce S3 calls and bandwidth.
    with metrics.timer(metrics.names.pdq_matcher_lambda.download_index):
        with open(LOCAL_INDEX_FILENAME, "wb") as index_file:
            s3_client.download_fileobj(bucket_name, key, index_file)

    with metrics.timer(metrics.names.pdq_matcher_lambda.parse_index):
        with open(LOCAL_INDEX_FILENAME, "rb") as index_file:
            result = pickle.load(index_file)

    return result
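
The TODO in get_index asks for time-limited caching of the downloaded index to cut S3 calls and bandwidth. One possible shape for that, using a module-level cache and a fixed TTL; every name here is hypothetical rather than the project's implementation:

import time

_INDEX_CACHE = {}  # (bucket_name, key) -> (loaded_at, index)
_INDEX_CACHE_TTL_SECONDS = 300

def get_index_cached(bucket_name, key):
    # Reuse a previously downloaded index while it is younger than the TTL,
    # otherwise fall back to get_index() and refresh the cache entry.
    now = time.time()
    cached = _INDEX_CACHE.get((bucket_name, key))
    if cached and now - cached[0] < _INDEX_CACHE_TTL_SECONDS:
        return cached[1]
    index = get_index(bucket_name, key)
    _INDEX_CACHE[(bucket_name, key)] = (now, index)
    return index
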
Example 8
    def write_hash_record(self, table: Table, hash_record: PipelineHashRecord):
        """
        Once a content signal has been created, write its corresponding hash
        record. These records can be used to do retroaction in case a new signal
        is obtained from sources.
        """
        with metrics.timer(metrics.names.hasher.write_record):
            hash_record.write_to_table(table)
Example 9
    def save(self, bucket_name: str):
        with metrics.timer(metrics.names.indexer.upload_index):
            index_file_bytes = pickle.dumps(self)
            get_s3_client().put_object(
                Bucket=bucket_name,
                Key=self.__class__._get_index_s3_key(),
                Body=index_file_bytes,
            )
Example 10
def lambda_handler(event, context):
    """
    Listens to SQS events fired when a new hash is generated. Loads the index
    stored in an S3 bucket and looks for a match.

    As per the default configuration
    - the index data bucket is INDEXES_BUCKET_NAME
    - the key name must be PDQ_INDEX_KEY
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])
        if message.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = message["hash"]
        key = message["key"]
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if results:
            match_ids = []
            for match in results:
                metadata = match.metadata
                logger.info("Match found for key: %s, hash %s -> %s", key,
                            hash_str, metadata)
                signal_id = metadata["id"]

                PDQMatchRecord(
                    key,
                    hash_str,
                    current_datetime,
                    signal_id,
                    metadata["source"],
                    metadata["hash"],
                ).write_to_table(records_table)

                match_ids.append(signal_id)
            sns_client.publish(
                TopicArn=OUTPUT_TOPIC_ARN,
                Subject="Match found in pdq_matcher lambda",
                Message=
                f"Match found for key: {key}, hash: {hash_str}, for IDs: {match_ids}",
            )
        else:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")

    metrics.flush()
Example 11
    def get_index(self, signal_type: t.Type[SignalType]) -> SignalTypeIndex:
        # If cached, return an index instance for the signal_type. If not, build
        # one, cache and return.
        if not signal_type in self._cached_indexes:
            index_cls = INDEX_MAPPING[signal_type]

            with metrics.timer(metrics.names.indexer.download_index):
                self._cached_indexes[signal_type] = index_cls.load(
                    bucket_name=self.index_bucket_name)

        return self._cached_indexes[signal_type]
Example 12
    def publish_hash_message(self, sqs_client: SQSClient,
                             hash_record: PipelineHashRecord):
        """
        Once you've written the hash record, publish a message to the matcher's
        input queue.
        """
        with metrics.timer(metrics.names.hasher.publish_message):
            sqs_client.send_message(
                QueueUrl=self.output_queue_url,
                MessageBody=json.dumps(hash_record.to_sqs_message()),
            )
Example 13
    def get_hashes(
        self, content_id: str, content_type: t.Type[ContentType], bytes_: bytes
    ) -> t.Generator[ContentSignal, None, None]:
        """
        Yields signals for content_type.
        """
        for signal_type in content_type.get_signal_types():
            if signal_type in self.supported_signal_types and issubclass(
                signal_type, BytesHasher
            ):
                with metrics.timer(metrics.names.hasher.hash(signal_type.get_name())):
                    hash_value = signal_type.hash_from_bytes(bytes_)

                yield ContentSignal(content_type, content_id, signal_type, hash_value)
Example 14
    def build_index_from_last_24h(cls, signal_type, storage_path, bucket_width) -> PDQIndex:
        """Create an index from the last 24 hours of records."""
        with metrics.timer(metrics.names.lcc.get_data):
            d = timedelta(days=1)

            # TODO: Split this block into 3 different metrics.timers:
            # get_records, building record_list, and PDQIndex.build
            past_day_content = TimeBucketizer.get_records(
                (datetime.now() - d),
                datetime.now(),
                signal_type,
                storage_path,
                bucket_width,
                HashRecord,
            )

        with metrics.timer(metrics.names.lcc.in_memory_processing):
            record_list = []
            for record in past_day_content:
                record_list.append((record.content_hash, record.content_id))

        with metrics.timer(metrics.names.lcc.build_index):
            return PDQIndex.build(record_list)
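
The TODO comment above suggests replacing the single get_data timer with three finer-grained ones. A hedged sketch of that split follows; the metric names get_records, build_record_list, and build_index are assumptions about what names.lcc might expose:

    def build_index_from_last_24h(cls, signal_type, storage_path, bucket_width) -> PDQIndex:
        """Sketch of the three-timer split suggested by the TODO above."""
        d = timedelta(days=1)
        with metrics.timer(metrics.names.lcc.get_records):
            past_day_content = TimeBucketizer.get_records(
                datetime.now() - d,
                datetime.now(),
                signal_type,
                storage_path,
                bucket_width,
                HashRecord,
            )
        with metrics.timer(metrics.names.lcc.build_record_list):
            record_list = [(r.content_hash, r.content_id) for r in past_day_content]
        with metrics.timer(metrics.names.lcc.build_index):
            return PDQIndex.build(record_list)
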
Example 15
    def match(self, signal_type: t.Type[SignalType],
              signal_value: str) -> t.List[IndexMatch]:
        """
        Returns MatchMessage which can be directly published to a queue.

        Note, this also filters out matches that are from datasets that have
        been de-activated.
        """
        index = self.get_index(signal_type)

        with metrics.timer(metrics.names.indexer.search_index):
            match_results: t.List[IndexMatch] = index.query(signal_value)

        if not match_results:
            # No matches found in the index
            return []

        return self.filter_match_results(match_results)
Example 16
    def get_hashes(self, content_type: t.Type[ContentType],
                   bytes_: bytes) -> t.Generator[ContentSignal, None, None]:
        """
        Yields signals for content_type.
        """
        for signal_type in content_type.get_signal_types():
            if signal_type in self.supported_signal_types and issubclass(
                    signal_type, BytesHasher):
                with metrics.timer(
                        metrics.names.hasher.hash(signal_type.get_name())):
                    try:
                        hash_value = signal_type.hash_from_bytes(bytes_)
                    except Exception:
                        logger.exception(
                            "Encountered exception while trying to hash_from_bytes. Unable to hash content."
                        )
                        continue

                yield ContentSignal(content_type, signal_type, hash_value)
Example 17
def lambda_handler(event, context):
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])
        if message.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = message["hash"]
        key = message["key"]
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if results:
            match_ids = []
            for match in results:
                metadata = match.metadata
                logger.info("Match found for key: %s, hash %s -> %s", key,
                            hash_str, metadata)
                te_id = metadata["id"]

                PDQMatchRecord(key, hash_str, current_datetime, te_id,
                               metadata["hash"]).write_to_table(records_table)

                match_ids.append(te_id)
            sns_client.publish(
                TopicArn=OUTPUT_TOPIC_ARN,
                Subject="Match found in pdq_matcher lambda",
                Message=
                f"Match found for key: {key}, hash: {hash_str}, for IDs: {match_ids}",
            )
        else:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")

    metrics.flush()
Example 18
    def get_index(self, signal_type: t.Type[SignalType]) -> SignalTypeIndex:
        """
        If cached, return an index instance for the signal_type. If not, build
        one, cache and return.
        """

        max_custom_threshold = (
            get_max_threshold_of_active_privacy_groups_for_signal_type(
                signal_type))
        index_cls = self._get_index_for_signal_type_matching(
            signal_type, max_custom_threshold)

        # Check for signal_type in cache AND confirm said index class type is
        # still correct for the given [optional] max_custom_threshold
        if not signal_type in self._cached_indexes or not isinstance(
                self._cached_indexes[signal_type], index_cls):
            with metrics.timer(metrics.names.indexer.download_index):
                self._cached_indexes[signal_type] = index_cls.load(
                    bucket_name=self.index_bucket_name)

        return self._cached_indexes[signal_type]
Example 19
def lambda_handler(event, context):
    """
    SQS Events generated by the submissions API or by files being added to S3.
    Downloads files to temp-storage, identifies content_type and generates
    allowed signal_types from it.

    Saves hash output to DynamoDB, sends a message on an output queue.

    Note that this brings the contents of a file into memory, so it is subject
    to the lambda's memory limit. The limit can be raised to 10GB, but that
    would be very expensive. [1]

    [1]: https://docs.aws.amazon.com/lambda/latest/dg/configuration-console.html
    """
    records_table = get_dynamodb().Table(DYNAMODB_TABLE)
    HMAConfig.initialize(HMA_CONFIG_TABLE)
    banks_table = BanksTable(
        get_dynamodb().Table(BANKS_TABLE),
        _get_signal_type_mapping(),
    )
    sqs_client = get_sqs_client()

    hasher = _get_hasher(_get_signal_type_mapping())

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "s3:TestEvent":
            continue

        media_to_process: t.List[t.Union[S3ImageSubmission,
                                         URLSubmissionMessage,
                                         BankSubmissionMessage]] = []

        if URLSubmissionMessage.could_be(message):
            media_to_process.append(
                URLSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()))
        elif S3ImageSubmissionBatchMessage.could_be(message):
            # S3 submissions can only be images for now.
            media_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_PREFIX).image_submissions)
        elif BankSubmissionMessage.could_be(message):
            media_to_process.append(
                BankSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()))
        else:
            logger.warn(f"Unprocessable Message: {message}")

        for media in media_to_process:
            if not hasher.supports(media.content_type):
                if isinstance(media, BankSubmissionMessage):
                    object_id = media.bank_id
                else:
                    object_id = media.content_id
                logger.warn(
                    f"Unprocessable content type: {media.content_type}, id: {object_id}"
                )
                continue

            with metrics.timer(metrics.names.hasher.download_file):
                try:
                    if hasattr(media, "key") and hasattr(media, "bucket"):
                        # Classic duck-typing. If it has key and bucket, must be an
                        # S3 submission.
                        media = t.cast(S3ImageSubmission, media)
                        bytes_: bytes = S3BucketContentSource(
                            media.bucket,
                            IMAGE_PREFIX).get_bytes(media.content_id)
                    else:
                        media = t.cast(URLSubmissionMessage, media)
                        bytes_: bytes = URLContentSource().get_bytes(media.url)
                except Exception:
                    if isinstance(media, BankSubmissionMessage):
                        object_id = media.bank_id
                    else:
                        object_id = media.content_id
                    logger.exception(
                        f"Encountered exception while trying to get_bytes for id: {object_id}. Unable to hash content."
                    )
                    continue

            for signal in hasher.get_hashes(media.content_type, bytes_):
                if isinstance(media, BankSubmissionMessage):
                    # route signals to bank datastore only.
                    bank_operations.add_bank_member_signal(
                        banks_table=banks_table,
                        bank_id=media.bank_id,
                        bank_member_id=media.bank_member_id,
                        signal_type=signal.signal_type,
                        signal_value=signal.signal_value,
                    )
                    # don't write hash records etc.
                    continue

                hash_record = PipelineHashRecord(
                    content_id=media.content_id,
                    signal_type=signal.signal_type,
                    content_hash=signal.signal_value,
                    updated_at=datetime.datetime.now(),
                )

                hasher.write_hash_record(records_table, hash_record)
                hasher.publish_hash_message(sqs_client, hash_record)

    metrics.flush()
Example 20
def lambda_handler(event, context):
    """
    SQS Events generated by the submissions API or by files being added to S3.
    Downloads files to temp-storage, identifies content_type and generates
    allowed signal_types from it.

    Saves hash output to DynamoDB, sends a message on an output queue.

    Note that this brings the contents of a file into memory, so it is subject
    to the lambda's memory limit. The limit can be raised to 10GB, but that
    would be very expensive. [1]

    [1]: https://docs.aws.amazon.com/lambda/latest/dg/configuration-console.html
    """
    records_table = get_dynamodb().Table(DYNAMODB_TABLE)
    sqs_client = get_sqs_client()

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "s3:TestEvent":
            continue

        media_to_process: t.List[t.Union[S3ImageSubmission,
                                         URLSubmissionMessage]] = []

        if URLSubmissionMessage.could_be(message):
            media_to_process.append(
                URLSubmissionMessage.from_sqs_message(message))
        elif S3ImageSubmissionBatchMessage.could_be(message):
            # S3 submissions can only be images for now.
            media_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_PREFIX).image_submissions)
        else:
            logger.warn(f"Unprocessable Message: {message}")

        for media in media_to_process:
            if not hasher.supports(media.content_type):
                logger.warn(
                    f"Unprocessable content type: {media.content_type}")
                continue

            with metrics.timer(metrics.names.hasher.download_file):
                if hasattr(media, "key") and hasattr(media, "bucket"):
                    # Classic duck-typing. If it has key and bucket, must be an
                    # S3 submission.
                    bytes_: bytes = S3BucketContentSource(
                        media.bucket, IMAGE_PREFIX).get_bytes(media.content_id)
                else:
                    bytes_: bytes = URLContentSource().get_bytes(media.url)

            for signal in hasher.get_hashes(media.content_id,
                                            media.content_type, bytes_):
                hash_record = PipelineHashRecord(
                    content_id=media.content_id,
                    signal_type=signal.signal_type,
                    content_hash=signal.signal_value,
                    updated_at=datetime.datetime.now(),
                )

                hasher.write_hash_record(records_table, hash_record)
                hasher.publish_hash_message(sqs_client, hash_record)

    metrics.flush()
Example 21
def lambda_handler(event, context):
    """
    Listens to SQS events generated when new files are added to S3. Downloads
    files to temp-storage, generates PDQ hash and quality from the file.

    The SQS events would be from S3. URL-only submissions are routed to
    hmalib.lambdas.hashing instead.

    Saves hash output to dynamodb.

    Sends a message on an output queue.

    Note: The image is brought into memory and then handed over to the hasher.
    If you are hashing large images, you may need to increase the memory
    allocated to the lambda. Also remember that images that look small on disk
    (eg. low quality jpegs) still occupy a lot of space in memory. The
    pixel-size of the image is a better indicator of the space it will take in
    memory.
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    for sqs_record in event["Records"]:
        message_body = json.loads(sqs_record["body"])
        message = json.loads(message_body["Message"])

        if message.get("Event") == "s3:TestEvent":
            logger.info("Disregarding S3 Test Event")
            continue

        images_to_process: t.List[S3ImageSubmission] = []

        if S3ImageSubmissionBatchMessage.could_be(message):
            images_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_FOLDER_KEY).image_submissions)
        else:
            logger.warn("PDQ Hahser could not process incoming message %s",
                        repr(message))

        for image in images_to_process:
            logger.info("Getting bytes for submission:  %s", repr(image))
            with metrics.timer(metrics.names.pdq_hasher_lambda.download_file):
                bytes_: bytes = get_image_bytes(image, IMAGE_FOLDER_KEY)

            logger.info("Generating PDQ hash for submission: %s", repr(image))

            with metrics.timer(metrics.names.pdq_hasher_lambda.hash):
                pdq_hash, quality = pdq_hasher.pdq_from_bytes(bytes_)

            hash_record = PipelineHashRecord(
                image.content_id,
                PdqSignal,
                pdq_hash,
                datetime.datetime.now(),
                {"Quality": quality},
            )

            hash_record.write_to_table(records_table)

            # Publish to SQS queue
            sqs_client.send_message(
                QueueUrl=OUTPUT_QUEUE_URL,
                MessageBody=json.dumps(hash_record.to_legacy_sqs_message()),
            )

            logger.info("Published new PDQ hash")

    metrics.flush()
Example 22
    def load(cls, bucket_name: str):
        with metrics.timer(metrics.names.indexer.download_index):
            index_file_bytes = (get_s3_client().get_object(
                Bucket=bucket_name,
                Key=cls._get_index_s3_key())["Body"].read())
            return pickle.loads(index_file_bytes)
Example 23
def lambda_handler(event, context):
    """
    Listens to SQS events fired when new data files are added to the data
    bucket's data directory. If the updated key matches a set of criteria,
    converts the raw data file into an index and writes to an output S3 bucket.

    As per the default configuration, the bucket must be
    - the hashing data bucket eg.
      dipanjanm-hashing-data20210224213427723700000003
    - the key name must be threat_exchange_data/pdq.te

    This means adding new versions of the datasets will not have an effect; you
    must add the exact pdq.te file.
    """

    if not was_pdq_data_updated(event):
        logger.info("PDQ Data Not Updated, skipping")
        return

    logger.info("PDQ Data Updated, updating pdq hash index")

    logger.info("Retreiving PDQ Data from S3")

    with metrics.timer(metrics.names.pdq_indexer_lambda.download_datafiles):
        # S3 doesn't have a built-in concept of folders, but the AWS UI
        # implements folder-like functionality using prefixes. We follow
        # the same convention here, using the folder name in a prefix search.
        s3_bucket_files = s3_client.list_objects_v2(
            Bucket=THREAT_EXCHANGE_DATA_BUCKET_NAME,
            Prefix=THREAT_EXCHANGE_DATA_FOLDER,
        )["Contents"]
        logger.info("Found %d Files", len(s3_bucket_files))

        pdq_data_files = [
            get_pdq_file(file["Key"]) for file in s3_bucket_files
            if file["Key"].endswith(THREAT_EXCHANGE_PDQ_FILE_EXTENSION)
        ]
        logger.info("Found %d PDQ Files", len(pdq_data_files))

    with metrics.timer(metrics.names.pdq_indexer_lambda.parse_datafiles):
        logger.info("Parsing PDQ Hash files")
        pdq_data = [
            parse_pdq_file(**pdq_data_file) for pdq_data_file in pdq_data_files
        ]

    with metrics.timer(metrics.names.pdq_indexer_lambda.merge_datafiles):
        logger.info("Merging PDQ Hash files")
        flat_pdq_data = [
            hash_row for pdq_file in pdq_data for hash_row in pdq_file
        ]

        merged_pdq_data = reduce(merge_pdq_files, flat_pdq_data, {}).items()

    with metrics.timer(metrics.names.pdq_indexer_lambda.build_index):
        logger.info("Creating PDQ Hash Index")
        index = PDQIndex.build(merged_pdq_data)

        logger.info("Putting index in S3")
        index_bytes = pickle.dumps(index)

    with metrics.timer(metrics.names.pdq_indexer_lambda.upload_index):
        s3_client.put_object(Bucket=INDEXES_BUCKET_NAME,
                             Key=PDQ_INDEX_KEY,
                             Body=index_bytes)

    logger.info("Index update complete")
    metrics.flush()
Example 24
def lambda_handler(event, context):
    """
    Listens to SQS events fired when new data files are added to the data
    bucket's data directory. If the updated key matches a set of criteria,
    converts the raw data file into an index and writes to an output S3 bucket.

    As per the default configuration, the bucket must be
    - the hashing data bucket eg. dipanjanm-hashing-<...>
    - the key name must be in the ThreatExchange folder (eg.
      threat_exchange_data/)
    - the key name must return a signal_type in
      ThreatUpdateS3Store.get_signal_type_from_object_key
    """
    updates = get_updated_files_by_signal_type(event)

    logger.info(updates)
    if not updates:
        logger.info("Signal Data Not Updated, skipping")
        return

    logger.info(
        f"Received updates for indicator_types: {','.join(map(lambda x: str(x), updates.keys()))}"
    )

    # Note: even though we know which files were updated, threatexchange indexes
    # do not yet allow adding new entries, so we must do a full rebuild. As a
    # result, we only end up using the signal types that were updated, not the
    # actual files that changed.

    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        threat_exchange_pdq_file_extension="PHASE_OUT_THIS_INTERNAL_DETAIL",
    )

    for updated_signal_type in updates.keys():
        adapter_class = _ADAPTER_MAPPING[updated_signal_type]
        data_files = adapter_class(
            config=s3_config,
            metrics_logger=metrics.names.indexer).load_data()

        with metrics.timer(metrics.names.indexer.merge_datafiles):
            logger.info(f"Merging {updated_signal_type} Hash files")
            flattened_data = [
                hash_row for file_ in data_files.values() for hash_row in file_
            ]

            merged_data = functools.reduce(merge_threat_exchange_files,
                                           flattened_data, {}).values()

        with metrics.timer(metrics.names.indexer.build_index):
            logger.info(f"Rebuilding {updated_signal_type} Index")
            index_class = INDEX_MAPPING[updated_signal_type]
            index: S3BackedInstrumentedIndexMixin = index_class.build(
                merged_data)

            logger.info(f"Putting {updated_signal_type} index in S3")
            index.save(bucket_name=INDEXES_BUCKET_NAME)
            metrics.flush()

    logger.info("Index updates complete")
Example 25
def lambda_handler(event, context):
    """
    Listens to SQS events fired when a new hash is generated. Loads the index
    stored in an S3 bucket and looks for a match.

    As per the default configuration
    - the index data bucket is INDEXES_BUCKET_NAME
    - the key name must be PDQ_INDEX_KEY

    When matched, publishes a notification to an SNS endpoint. Note this is in
    contrast with hasher and indexer. They publish to SQS directly. Publishing
    to SQS implies there can be only one consumer.

    Because, here, in the matcher, we publish to SNS, we can plug multiple
    queues behind it and profit!
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])
        if message.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = message["hash"]
        key = message["key"]
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if results:
            match_ids = []
            for match in results:
                metadata = match.metadata
                logger.info("Match found for key: %s, hash %s -> %s", key,
                            hash_str, metadata)
                signal_id = metadata["id"]

                # TODO: Add source (threatexchange) tags to match record
                PDQMatchRecord(
                    key,
                    hash_str,
                    current_datetime,
                    signal_id,
                    metadata["source"],
                    metadata["hash"],
                ).write_to_table(records_table)

                match_ids.append(signal_id)

            # TODO: Add source (threatexchange) tags to match message
            message = MatchMessage(
                content_key=key,
                content_hash=hash_str,
                match_details=[
                    DatasetMatchDetails(banked_indicator_id=signal_id)
                    for signal_id in match_ids
                ],
            )

            # Publish one message for the set of matches.
            sns_client.publish(TopicArn=OUTPUT_TOPIC_ARN,
                               Message=message.to_sns_message())
        else:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")

    metrics.flush()
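
The docstring notes that publishing to SNS (instead of directly to SQS) lets several queues consume the same match notification. A hedged boto3 sketch of that fan-out wiring, with placeholder ARNs; the subscribed queues' access policies must also allow the topic to deliver to them:

import boto3

sns = boto3.client("sns")

# Placeholder ARNs for illustration only.
topic_arn = "arn:aws:sns:us-east-1:123456789012:hma-matches"
queue_arns = [
    "arn:aws:sqs:us-east-1:123456789012:actioner-input",
    "arn:aws:sqs:us-east-1:123456789012:analytics-input",
]

# Each subscribed queue receives its own copy of every published match message.
for queue_arn in queue_arns:
    sns.subscribe(TopicArn=topic_arn, Protocol="sqs", Endpoint=queue_arn)
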
Example 26
def lambda_handler(event, context):
    """
    Listens to SQS events fired when a new hash is generated. Loads the index
    stored in an S3 bucket and looks for a match.

    As per the default configuration
    - the index data bucket is INDEXES_BUCKET_NAME
    - the key name must be PDQ_INDEX_KEY

    When matched, publishes a notification to an SNS endpoint. Note this is in
    contrast with hasher and indexer. They publish to SQS directly. Publishing
    to SQS implies there can be only one consumer.

    Because, here, in the matcher, we publish to SNS, we can plug multiple
    queues behind it and profit!
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])
        if message.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = message["hash"]
        key = message["key"]
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if results:
            match_ids = []
            matching_banked_signals: t.List[BankedSignal] = []
            for match in results:
                metadata = match.metadata
                logger.info("Match found for key: %s, hash %s -> %s", key,
                            hash_str, metadata)
                privacy_group_list = metadata.get("privacy_groups", [])
                metadata["privacy_groups"] = list(
                    filter(
                        lambda x: get_privacy_group_matcher_active(
                            str(x),
                            time.time() // CACHED_TIME,
                            # CACHED_TIME defaults to 300 seconds; this converts
                            # time.time() into an int parameter that changes
                            # every 300 seconds.
                        ),
                        privacy_group_list,
                    ))
                if metadata["privacy_groups"]:
                    signal_id = str(metadata["id"])

                    with metrics.timer(metrics.names.pdq_matcher_lambda.
                                       write_match_record):
                        # TODO: Add source (threatexchange) tags to match record
                        PDQMatchRecord(
                            key,
                            hash_str,
                            current_datetime,
                            signal_id,
                            metadata["source"],
                            metadata["hash"],
                        ).write_to_table(records_table)

                    for pg in metadata.get("privacy_groups", []):
                        # Only write the metadata if it is not already in the
                        # table; once initially created, it is the fetcher's
                        # job to keep the item up to date.
                        PDQSignalMetadata(
                            signal_id,
                            pg,
                            current_datetime,
                            metadata["source"],
                            metadata["hash"],
                            metadata["tags"].get(pg, []),
                        ).write_to_table_if_not_found(records_table)

                    match_ids.append(signal_id)

                    # TODO: change naming upstream and here from privacy_group[s]
                    # to dataset[s]
                    for privacy_group in metadata.get("privacy_groups", []):
                        banked_signal = BankedSignal(str(signal_id),
                                                     str(privacy_group),
                                                     str(metadata["source"]))
                        for tag in metadata["tags"].get(privacy_group, []):
                            banked_signal.add_classification(tag)
                        matching_banked_signals.append(banked_signal)

            # TODO: Add source (threatexchange) tags to match message
            if matching_banked_signals:
                match_message = MatchMessage(
                    content_key=key,
                    content_hash=hash_str,
                    matching_banked_signals=matching_banked_signals,
                )

                logger.info(f"Publishing match_message: {match_message}")

                # Publish one message for the set of matches.
                sns_client.publish(TopicArn=OUTPUT_TOPIC_ARN,
                                   Message=match_message.to_aws_json())

        else:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")

    metrics.flush()