def lambda_handler(event, context):
    """
    Hash every image referenced by the incoming SQS batch with PDQ.

    Each SQS record wraps an SNS notification whose "Message" field is an S3
    event. For every non-empty object listed in that event the handler reads
    the object's bytes, computes the PDQ hash and quality, writes a
    PipelinePDQHashRecord to the DynamoDB table and publishes the record on
    the output queue. Metrics are flushed once the whole batch is processed.

    Note: the object body is read fully into memory; no temporary files are
    written by this handler.
    """
    hash_records_table = dynamodb.Table(DYNAMODB_TABLE)

    for sqs_record in event["Records"]:
        notification = json.loads(sqs_record["body"])
        s3_event = json.loads(notification["Message"])

        if s3_event.get("Event") == "s3:TestEvent":
            logger.info("Disregarding S3 Test Event")
            continue

        for s3_record in s3_event["Records"]:
            s3_info = s3_record["s3"]
            bucket_name = s3_info["bucket"]["name"]
            key = unquote_plus(s3_info["object"]["key"])

            # Folders and empty files are skipped: both report a size of zero.
            if s3_info["object"]["size"] == 0:
                logger.info("Disregarding empty file or directory: %s", key)
                continue

            logger.info("generating pdq hash for %s/%s", bucket_name, key)

            with metrics.timer(metrics.names.pdq_hasher_lambda.download_file):
                s3_object = s3_client.get_object(Bucket=bucket_name, Key=key)
                bytes_: bytes = s3_object["Body"].read()

            with metrics.timer(metrics.names.pdq_hasher_lambda.hash):
                pdq_hash, quality = pdq_hasher.pdq_from_bytes(bytes_)

            hash_record = PipelinePDQHashRecord(
                key, pdq_hash, datetime.datetime.now(), quality
            )
            hash_record.write_to_table(hash_records_table)

            # Hand the freshly written record to the output queue.
            outgoing_body = json.dumps(hash_record.to_sqs_message())
            sqs_client.send_message(
                QueueUrl=OUTPUT_QUEUE_URL,
                MessageBody=outgoing_body,
            )
            logger.info("Published new PDQ hash")

    metrics.flush()
def worker():
    """Burn a small random amount of time inside each of two metric timers."""
    timed_metrics = (
        names.pdq_hasher_lambda.download_file,
        names.pdq_hasher_lambda.hash,
    )
    for metric_name in timed_metrics:
        # Sleep for up to 10ms so the timer has something to measure.
        with timer(metric_name):
            time.sleep(random.random() / 100.0)
def load_data(self) -> t.Dict[str, t.List[HashRowT]]:
    """
    Load and parse every data file of this adapter's indicator type.

    Lists the objects under the configured ThreatExchange data folder, keeps
    the keys that end with ``self.indicator_type_file_extension``, fetches
    each of them with ``self._get_file`` and parses the result with
    ``self._parse_file``.

    Returns a mapping from S3 object key (file name) to the list of rows
    parsed from that file. The mapping is empty when no object matches.
    """
    logger.info("Retreiving %s Data from S3", self.file_type_str_name)
    with metrics.timer(self.metrics_logger.download_datafiles):
        # S3 doesn't have a built in concept of folders but the AWS UI
        # implements folder-like functionality using prefixes. We follow
        # this same convention here using folder name in a prefix search.
        listing = s3_client.list_objects_v2(
            Bucket=self.config.threat_exchange_data_bucket_name,
            Prefix=self.config.threat_exchange_data_folder,
        )
        # S3 leaves "Contents" out of the response entirely when nothing
        # matches the prefix; treat that as "no files" instead of crashing.
        # NOTE(review): a single list_objects_v2 call returns at most one
        # page of keys; pagination is not handled here.
        s3_bucket_files = listing.get("Contents", [])
        logger.info("Found %d Files", len(s3_bucket_files))

        typed_data_files = {
            file["Key"]: self._get_file(file["Key"])
            for file in s3_bucket_files
            if file["Key"].endswith(self.indicator_type_file_extension)
        }
        logger.info(
            "Found %d %s Files", len(typed_data_files), self.file_type_str_name
        )

    with metrics.timer(self.metrics_logger.parse_datafiles):
        logger.info("Parsing %s Hash files", self.file_type_str_name)
        typed_data = {
            file_name: self._parse_file(**typed_data_file)
            for file_name, typed_data_file in typed_data_files.items()
        }

    return typed_data
def lambda_handler(event, context):
    """
    Rebuild the PDQ index when the PDQ data file in S3 changes.

    Triggered by SQS events fired when files are added to the data bucket's
    data directory. Events that do not touch the PDQ data file are ignored.
    Otherwise the raw data file is downloaded, parsed as CSV, turned into a
    PDQIndex, pickled and written to the indexes bucket.

    As per the default configuration
    - the bucket must be the hashing data bucket
      eg. dipanjanm-hashing-data20210224213427723700000003
    - the key name must be threat_exchange_data/pdq.te

    Adding new versions of the datasets under other keys has no effect; the
    exact pdq.te file must be updated.
    """
    if not was_pdq_data_updated(event):
        logger.info("PDQ Data Not Updated, skipping")
        return

    logger.info("PDQ Data Updated, updating pdq hash index")
    logger.info("Retreiving PDQ Data from S3")

    with metrics.timer(metrics.names.pdq_indexer_lambda.download_datafile):
        pdq_data_file = s3_client.get_object(
            Bucket=THREAT_EXCHANGE_DATA_BUCKET_NAME,
            Key=THREAT_EXCHANGE_PDQ_DATA_KEY,
        )

    with metrics.timer(metrics.names.pdq_indexer_lambda.parse_datafile):
        utf8_body = codecs.getreader("utf-8")(pdq_data_file["Body"])
        pdq_data_reader = csv.DictReader(
            utf8_body,
            fieldnames=PDQ_DATA_FILE_COLUMNS,
        )
        pdq_data = []
        for row in pdq_data_reader:
            hash_value = row["hash"]
            # The hash is repeated in the metadata for easy look up on match.
            entry_metadata = {
                "id": int(row["id"]),
                "hash": row["hash"],
            }
            pdq_data.append((hash_value, entry_metadata))

    with metrics.timer(metrics.names.pdq_indexer_lambda.build_index):
        logger.info("Creating PDQ Hash Index")
        index = PDQIndex.build(pdq_data)

    logger.info("Putting index in S3")
    index_bytes = pickle.dumps(index)

    with metrics.timer(metrics.names.pdq_indexer_lambda.upload_index):
        s3_client.put_object(
            Bucket=INDEXES_BUCKET_NAME,
            Key=PDQ_INDEX_KEY,
            Body=index_bytes,
        )

    logger.info("Index update complete")
    metrics.flush()
def lambda_handler(event, context):
    """
    Rebuild the index of every indexable signal type.

    Runs on a schedule. For each entry of ALL_INDEXABLE_SIGNAL_TYPES the
    handler loads the signal type's data files from S3, adds the hash rows
    coming from banks, merges everything on hash value, builds each index
    class registered for the signal type and saves it to the indexes bucket.

    As per the default configuration, the bucket must be
    - the hashing data bucket eg. dipanjanm-hashing-<...>
    - the key name must be in the ThreatExchange folder
      (eg. threat_exchange_data/)
    - the key name must return a signal_type in
      ThreatUpdateS3Store.get_signal_type_from_object_key
    """
    # Note: even though we know which files were updated, threatexchange
    # indexes do not yet allow adding new entries. So, we must do a full
    # rebuild. So, we only end up using the signal types that were updated,
    # not the actual files that changed.
    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
    )
    banks_table = BanksTable(dynamodb.Table(BANKS_TABLE))

    for signal_type in ALL_INDEXABLE_SIGNAL_TYPES:
        adapter = _ADAPTER_MAPPING[signal_type](
            config=s3_config, metrics_logger=metrics.names.indexer
        )
        data_files = adapter.load_data()
        bank_data = get_all_bank_hash_rows(signal_type, banks_table)

        with metrics.timer(metrics.names.indexer.merge_datafiles):
            logger.info(f"Merging {signal_type} Hash files")

            # go from dict[filename, list<hash rows>] → list<hash rows>
            flattened_data = []
            for file_rows in data_files.values():
                flattened_data.extend(file_rows)

            merged_by_hash = functools.reduce(
                merge_hash_rows_on_hash_value,
                flattened_data + bank_data,
                {},
            )
            merged_data = merged_by_hash.values()

        with metrics.timer(metrics.names.indexer.build_index):
            logger.info(f"Rebuilding {signal_type} Index")

            for index_class in INDEX_MAPPING[signal_type]:
                index: S3BackedInstrumentedIndexMixin = index_class.build(
                    merged_data
                )
                logger.info(
                    f"Putting {signal_type} index in S3 for index {index.get_index_class_name()}"
                )
                index.save(bucket_name=INDEXES_BUCKET_NAME)

        metrics.flush()

    logger.info("Index updates complete")
def lambda_handler(event, context):
    """
    Rebuild the PDQ index when PDQ data files in S3 change.

    Triggered by SQS events fired when files are added to the data bucket's
    data directory. Events that do not touch PDQ data are ignored. Otherwise
    all PDQ data files are loaded through ThreatExchangeS3PDQAdapter, merged
    with merge_pdq_files, built into a PDQIndex, pickled and written to the
    indexes bucket.

    As per the default configuration, the bucket must be
    - the hashing data bucket
      eg. dipanjanm-hashing-data20210224213427723700000003
    - the key name must be in the ThreatExchange folder
      (eg. threat_exchange_data/)
    - the key name must be a pdq file ending in ".pdq.te"

    Adding new versions of the datasets under other keys has no effect; the
    exact pdq.te file must be added.
    """
    if not was_pdq_data_updated(event):
        logger.info("PDQ Data Not Updated, skipping")
        return

    logger.info("PDQ Data Updated, updating pdq hash index")
    metrics_logger = metrics.names.pdq_indexer_lambda

    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        threat_exchange_pdq_file_extension=THREAT_EXCHANGE_PDQ_FILE_EXTENSION,
    )
    adapter = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics_logger
    )
    pdq_data_files = adapter.load_data()

    with metrics.timer(metrics_logger.merge_datafiles):
        logger.info("Merging PDQ Hash files")
        # Collapse {file name: rows} into one flat list of rows.
        flat_pdq_data = []
        for file_rows in pdq_data_files.values():
            flat_pdq_data.extend(file_rows)

        merged_by_hash = reduce(merge_pdq_files, flat_pdq_data, {})
        merged_pdq_data = merged_by_hash.values()

    with metrics.timer(metrics_logger.build_index):
        logger.info("Creating PDQ Hash Index")
        index = PDQIndex.build(merged_pdq_data)

    logger.info("Putting index in S3")
    index_bytes = pickle.dumps(index)

    with metrics.timer(metrics_logger.upload_index):
        s3_client.put_object(
            Bucket=INDEXES_BUCKET_NAME,
            Key=PDQ_INDEX_KEY,
            Body=index_bytes,
        )

    logger.info("Index update complete")
    metrics.flush()
def get_index(bucket_name, key):
    """
    Load the given index from the s3 bucket and deserialize it.

    The object at ``key`` in ``bucket_name`` is downloaded to
    LOCAL_INDEX_FILENAME and then unpickled from that file. Both file handles
    are closed before returning.

    Returns the unpickled index object.
    """
    # TODO Cache this index for a period of time to reduce S3 calls and bandwidth.
    with metrics.timer(metrics.names.pdq_matcher_lambda.download_index):
        with open(LOCAL_INDEX_FILENAME, "wb") as index_file:
            s3_client.download_fileobj(bucket_name, key, index_file)

    with metrics.timer(metrics.names.pdq_matcher_lambda.parse_index):
        # Use a context manager so the read handle is closed deterministically
        # instead of lingering until garbage collection.
        # NOTE: pickle executes arbitrary code on load; the index bucket must
        # only ever contain objects written by trusted code.
        with open(LOCAL_INDEX_FILENAME, "rb") as index_file:
            result = pickle.load(index_file)

    return result
def write_hash_record(self, table: Table, hash_record: PipelineHashRecord):
    """
    Persist the hash record of a freshly created content signal.

    The write to ``table`` is timed under the hasher's write_record metric.
    Stored records can later be used to do retroaction in case a new signal
    is obtained from sources.
    """
    write_metric = metrics.names.hasher.write_record
    with metrics.timer(write_metric):
        hash_record.write_to_table(table)
def save(self, bucket_name: str):
    """
    Pickle this index and upload it to ``bucket_name``.

    The object key comes from the class's ``_get_index_s3_key``. Both the
    serialization and the upload are timed under the indexer's upload_index
    metric.
    """
    with metrics.timer(metrics.names.indexer.upload_index):
        serialized_index = pickle.dumps(self)
        s3 = get_s3_client()
        s3.put_object(
            Bucket=bucket_name,
            Key=self.__class__._get_index_s3_key(),
            Body=serialized_index,
        )
def lambda_handler(event, context):
    """
    Match newly generated hashes against the PDQ index.

    Listens to SQS events fired when a new hash is generated. Loads the index
    stored in S3 (bucket INDEXES_BUCKET_NAME, key PDQ_INDEX_KEY) and queries
    it with each hash. Every match is written to DynamoDB as a
    PDQMatchRecord, and one SNS notification listing all matched ids is
    published per hash that matched.
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])
        if message.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = message["hash"]
        key = message["key"]
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if not results:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")
            continue

        match_ids = []
        for match in results:
            metadata = match.metadata
            logger.info(
                "Match found for key: %s, hash %s -> %s", key, hash_str, metadata
            )
            signal_id = metadata["id"]

            match_record = PDQMatchRecord(
                key,
                hash_str,
                current_datetime,
                signal_id,
                metadata["source"],
                metadata["hash"],
            )
            match_record.write_to_table(records_table)
            match_ids.append(signal_id)

        # One notification covers every match found for this hash.
        sns_client.publish(
            TopicArn=OUTPUT_TOPIC_ARN,
            Subject="Match found in pdq_matcher lambda",
            Message=f"Match found for key: {key}, hash: {hash_str}, for IDs: {match_ids}",
        )

    metrics.flush()
def get_index(self, signal_type: t.Type[SignalType]) -> SignalTypeIndex:
    """
    Return the index for ``signal_type``, loading it on first use.

    A cached instance is returned when one exists. Otherwise the index class
    mapped to the signal type is loaded from ``self.index_bucket_name``,
    stored in the cache and returned.
    """
    cache = self._cached_indexes
    if signal_type in cache:
        return cache[signal_type]

    index_cls = INDEX_MAPPING[signal_type]
    with metrics.timer(metrics.names.indexer.download_index):
        cache[signal_type] = index_cls.load(bucket_name=self.index_bucket_name)

    return cache[signal_type]
def publish_hash_message(self, sqs_client: SQSClient, hash_record: PipelineHashRecord):
    """
    Send the hash record to the queue at ``self.output_queue_url``.

    Meant to be called once the hash record has been written; the queue is
    the matcher's input. The record is serialized as JSON from its
    ``to_sqs_message()`` form and the send is timed under the hasher's
    publish_message metric.
    """
    publish_metric = metrics.names.hasher.publish_message
    with metrics.timer(publish_metric):
        message_body = json.dumps(hash_record.to_sqs_message())
        sqs_client.send_message(
            QueueUrl=self.output_queue_url,
            MessageBody=message_body,
        )
def get_hashes(
    self, content_id: str, content_type: t.Type[ContentType], bytes_: bytes
) -> t.Generator[ContentSignal, None, None]:
    """
    Yield one ContentSignal per hashable signal type of ``content_type``.

    A signal type is hashed only when it is in ``self.supported_signal_types``
    and is a BytesHasher subclass. Each hash computation is timed under a
    metric named after the signal type.
    """
    for signal_type in content_type.get_signal_types():
        if signal_type not in self.supported_signal_types:
            continue
        if not issubclass(signal_type, BytesHasher):
            continue

        hash_metric = metrics.names.hasher.hash(signal_type.get_name())
        with metrics.timer(hash_metric):
            hash_value = signal_type.hash_from_bytes(bytes_)

        yield ContentSignal(content_type, content_id, signal_type, hash_value)
def build_index_from_last_24h(cls, signal_type, storage_path, bucket_width) -> "PDQIndex":
    """
    Build a PDQ index from the records of the last 24 hours.

    Records are fetched through ``TimeBucketizer.get_records`` for the window
    ending now and starting exactly one day earlier, reduced to
    ``(content_hash, content_id)`` pairs and handed to ``PDQIndex.build``.
    The three phases are timed under the lcc get_data, in_memory_processing
    and build_index metrics respectively.

    Returns the index produced by ``PDQIndex.build``.
    """
    with metrics.timer(metrics.names.lcc.get_data):
        # Sample the clock once so the window is exactly one day wide.
        window_end = datetime.now()
        window_start = window_end - timedelta(days=1)
        past_day_content = TimeBucketizer.get_records(
            window_start,
            window_end,
            signal_type,
            storage_path,
            bucket_width,
            HashRecord,
        )

    with metrics.timer(metrics.names.lcc.in_memory_processing):
        record_list = [
            (record.content_hash, record.content_id) for record in past_day_content
        ]

    with metrics.timer(metrics.names.lcc.build_index):
        return PDQIndex.build(record_list)
def match(self, signal_type: t.Type[SignalType], signal_value: str) -> t.List[IndexMatch]:
    """
    Query the index of ``signal_type`` for ``signal_value``.

    Returns an empty list when the index has no match. Otherwise the raw
    matches are passed through ``self.filter_match_results`` and its result
    is returned. Note, that step also filters out matches that are from
    datasets that have been de-activated.
    """
    index = self.get_index(signal_type)

    with metrics.timer(metrics.names.indexer.search_index):
        match_results: t.List[IndexMatch] = index.query(signal_value)

    # Nothing found in the index: skip filtering altogether.
    return self.filter_match_results(match_results) if match_results else []
def get_hashes(self, content_type: t.Type[ContentType],
               bytes_: bytes) -> t.Generator[ContentSignal, None, None]:
    """
    Yield one ContentSignal per signal type that could be hashed.

    A signal type is attempted only when it is in
    ``self.supported_signal_types`` and is a BytesHasher subclass. If hashing
    raises, the exception is logged and that signal type is skipped; the
    remaining signal types are still processed.
    """
    for signal_type in content_type.get_signal_types():
        if signal_type not in self.supported_signal_types:
            continue
        if not issubclass(signal_type, BytesHasher):
            continue

        hash_metric = metrics.names.hasher.hash(signal_type.get_name())
        with metrics.timer(hash_metric):
            try:
                hash_value = signal_type.hash_from_bytes(bytes_)
            except Exception:
                logger.exception(
                    "Encountered exception while trying to hash_from_bytes. Unable to hash content."
                )
                continue

        yield ContentSignal(content_type, signal_type, hash_value)
def lambda_handler(event, context):
    """
    Match incoming hashes against the PDQ index loaded from S3.

    For every SQS record (test events excepted) the hash is looked up in the
    index. Each match is stored as a PDQMatchRecord in DynamoDB and a single
    SNS notification listing the matched ids is published for that hash.
    Metrics are flushed at the end of the batch.
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])
        if message.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = message["hash"]
        key = message["key"]
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if not results:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")
            continue

        match_ids = []
        for match in results:
            metadata = match.metadata
            logger.info(
                "Match found for key: %s, hash %s -> %s", key, hash_str, metadata
            )
            te_id = metadata["id"]

            match_record = PDQMatchRecord(
                key, hash_str, current_datetime, te_id, metadata["hash"]
            )
            match_record.write_to_table(records_table)
            match_ids.append(te_id)

        # One notification covers every match found for this hash.
        sns_client.publish(
            TopicArn=OUTPUT_TOPIC_ARN,
            Subject="Match found in pdq_matcher lambda",
            Message=f"Match found for key: {key}, hash: {hash_str}, for IDs: {match_ids}",
        )

    metrics.flush()
def get_index(self, signal_type: t.Type[SignalType]) -> SignalTypeIndex:
    """
    Return the index for ``signal_type``, reloading it when needed.

    The index class is chosen from the signal type and the maximum custom
    threshold of the active privacy groups. The cached instance is reused
    only if it exists and is an instance of that class; otherwise the index
    is loaded from ``self.index_bucket_name`` and cached.
    """
    max_custom_threshold = get_max_threshold_of_active_privacy_groups_for_signal_type(
        signal_type
    )
    index_cls = self._get_index_for_signal_type_matching(
        signal_type, max_custom_threshold
    )

    # A cache entry is usable only when present AND still of the index class
    # that matches the given [optional] max_custom_threshold.
    cache_is_valid = signal_type in self._cached_indexes and isinstance(
        self._cached_indexes[signal_type], index_cls
    )
    if not cache_is_valid:
        with metrics.timer(metrics.names.indexer.download_index):
            self._cached_indexes[signal_type] = index_cls.load(
                bucket_name=self.index_bucket_name
            )

    return self._cached_indexes[signal_type]
def _submission_log_id(media):
    """Return the identifier used to refer to *media* in log messages."""
    if isinstance(media, BankSubmissionMessage):
        return media.bank_id
    return media.content_id


def lambda_handler(event, context):
    """
    Hash submitted content and route the resulting signals.

    Handles SQS events generated by the submissions API or by files being
    added to S3. Each message is interpreted as a URL submission, an S3 image
    batch or a bank submission; anything else is logged and skipped. For each
    piece of media whose content type the hasher supports, the bytes are
    fetched and every signal produced by the hasher is either
    - added to the bank datastore (bank submissions), or
    - written as a PipelineHashRecord to DynamoDB and published on the output
      queue (all other submissions).

    Failures while fetching bytes are logged and the media is skipped.

    Note that this brings the contents of a file into memory. This is subject
    to the resource limitation on the lambda. Potentially extendable until
    10GB, but that would be super-expensive. [1]

    [1]: https://docs.aws.amazon.com/lambda/latest/dg/configuration-console.html
    """
    records_table = get_dynamodb().Table(DYNAMODB_TABLE)
    HMAConfig.initialize(HMA_CONFIG_TABLE)
    banks_table = BanksTable(
        get_dynamodb().Table(BANKS_TABLE),
        _get_signal_type_mapping(),
    )
    sqs_client = get_sqs_client()
    hasher = _get_hasher(_get_signal_type_mapping())

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "s3:TestEvent":
            continue

        media_to_process: t.List[
            t.Union[S3ImageSubmission, URLSubmissionMessage, BankSubmissionMessage]
        ] = []

        if URLSubmissionMessage.could_be(message):
            media_to_process.append(
                URLSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()
                )
            )
        elif S3ImageSubmissionBatchMessage.could_be(message):
            # S3 submissions can only be images for now.
            media_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_PREFIX
                ).image_submissions
            )
        elif BankSubmissionMessage.could_be(message):
            media_to_process.append(
                BankSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()
                )
            )
        else:
            # Logger.warn is a deprecated alias; use warning().
            logger.warning(f"Unprocessable Message: {message}")

        for media in media_to_process:
            if not hasher.supports(media.content_type):
                logger.warning(
                    f"Unprocessable content type: {media.content_type}, id: {_submission_log_id(media)}"
                )
                continue

            with metrics.timer(metrics.names.hasher.download_file):
                try:
                    if hasattr(media, "key") and hasattr(media, "bucket"):
                        # Classic duck-typing. If it has key and bucket, must
                        # be an S3 submission.
                        media = t.cast(S3ImageSubmission, media)
                        bytes_: bytes = S3BucketContentSource(
                            media.bucket, IMAGE_PREFIX
                        ).get_bytes(media.content_id)
                    else:
                        media = t.cast(URLSubmissionMessage, media)
                        bytes_ = URLContentSource().get_bytes(media.url)
                except Exception:
                    # Best effort: one unreadable item must not fail the batch.
                    logger.exception(
                        f"Encountered exception while trying to get_bytes for id: {_submission_log_id(media)}. Unable to hash content."
                    )
                    continue

            for signal in hasher.get_hashes(media.content_type, bytes_):
                if isinstance(media, BankSubmissionMessage):
                    # route signals to bank datastore only.
                    bank_operations.add_bank_member_signal(
                        banks_table=banks_table,
                        bank_id=media.bank_id,
                        bank_member_id=media.bank_member_id,
                        signal_type=signal.signal_type,
                        signal_value=signal.signal_value,
                    )
                    # don't write hash records etc.
                    continue

                hash_record = PipelineHashRecord(
                    content_id=media.content_id,
                    signal_type=signal.signal_type,
                    content_hash=signal.signal_value,
                    updated_at=datetime.datetime.now(),
                )

                hasher.write_hash_record(records_table, hash_record)
                hasher.publish_hash_message(sqs_client, hash_record)

    metrics.flush()
def lambda_handler(event, context):
    """
    Hash submitted content and publish the resulting hash records.

    Handles SQS events generated by the submissions API or by files being
    added to S3. Each message is interpreted as a URL submission or an S3
    image batch; anything else is logged and skipped. For each piece of media
    whose content type the hasher supports, the bytes are fetched, and every
    signal produced by the hasher is written as a PipelineHashRecord to
    DynamoDB and published on the output queue.

    Note that this brings the contents of a file into memory. This is subject
    to the resource limitation on the lambda. Potentially extendable until
    10GB, but that would be super-expensive. [1]

    [1]: https://docs.aws.amazon.com/lambda/latest/dg/configuration-console.html
    """
    records_table = get_dynamodb().Table(DYNAMODB_TABLE)
    sqs_client = get_sqs_client()

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "s3:TestEvent":
            continue

        media_to_process: t.List[
            t.Union[S3ImageSubmission, URLSubmissionMessage]
        ] = []

        if URLSubmissionMessage.could_be(message):
            media_to_process.append(URLSubmissionMessage.from_sqs_message(message))
        elif S3ImageSubmissionBatchMessage.could_be(message):
            # S3 submissions can only be images for now.
            media_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_PREFIX
                ).image_submissions
            )
        else:
            # Logger.warn is a deprecated alias; use warning().
            logger.warning(f"Unprocessable Message: {message}")

        for media in media_to_process:
            if not hasher.supports(media.content_type):
                logger.warning(f"Unprocessable content type: {media.content_type}")
                continue

            with metrics.timer(metrics.names.hasher.download_file):
                if hasattr(media, "key") and hasattr(media, "bucket"):
                    # Classic duck-typing. If it has key and bucket, must be
                    # an S3 submission.
                    bytes_: bytes = S3BucketContentSource(
                        media.bucket, IMAGE_PREFIX
                    ).get_bytes(media.content_id)
                else:
                    bytes_ = URLContentSource().get_bytes(media.url)

            for signal in hasher.get_hashes(
                media.content_id, media.content_type, bytes_
            ):
                hash_record = PipelineHashRecord(
                    content_id=media.content_id,
                    signal_type=signal.signal_type,
                    content_hash=signal.signal_value,
                    updated_at=datetime.datetime.now(),
                )

                hasher.write_hash_record(records_table, hash_record)
                hasher.publish_hash_message(sqs_client, hash_record)

    metrics.flush()
def lambda_handler(event, context):
    """
    Generate PDQ hashes for images submitted through S3.

    Listens to SQS events generated when new files are added to S3. Each SQS
    record body is JSON whose "Message" field holds the actual JSON message.
    Messages that look like an S3 image batch are expanded into image
    submissions; anything else is logged and skipped. URL only submissions
    are routed to hmalib.lambdas.hashing instead.

    For each image the bytes are fetched, the PDQ hash and quality computed,
    a PipelineHashRecord written to DynamoDB and the record published on the
    output queue in its legacy SQS message form.

    Note: The image is brought into memory and then handed over to the
    hasher. If you are hashing large images, you may need to increase the
    memory allocated to the lambda. Also remember that images that look small
    on disk (eg. low quality jpegs) still occupy a lot of space in memory.
    The pixel-size of the image is a better indicator of the space it will
    take in memory.
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    for sqs_record in event["Records"]:
        message_body = json.loads(sqs_record["body"])
        message = json.loads(message_body["Message"])

        if message.get("Event") == "s3:TestEvent":
            logger.info("Disregarding S3 Test Event")
            continue

        images_to_process: t.List[S3ImageSubmission] = []

        if S3ImageSubmissionBatchMessage.could_be(message):
            images_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_FOLDER_KEY
                ).image_submissions
            )
        else:
            # Logger.warn is a deprecated alias; use warning().
            logger.warning(
                "PDQ Hahser could not process incoming message %s", repr(message)
            )

        for image in images_to_process:
            logger.info("Getting bytes for submission:  %s", repr(image))
            with metrics.timer(metrics.names.pdq_hasher_lambda.download_file):
                bytes_: bytes = get_image_bytes(image, IMAGE_FOLDER_KEY)

            logger.info("Generating PDQ hash for submission: %s", repr(image))

            with metrics.timer(metrics.names.pdq_hasher_lambda.hash):
                pdq_hash, quality = pdq_hasher.pdq_from_bytes(bytes_)

            hash_record = PipelineHashRecord(
                image.content_id,
                PdqSignal,
                pdq_hash,
                datetime.datetime.now(),
                {"Quality": quality},
            )

            hash_record.write_to_table(records_table)

            # Publish to SQS queue
            sqs_client.send_message(
                QueueUrl=OUTPUT_QUEUE_URL,
                MessageBody=json.dumps(hash_record.to_legacy_sqs_message()),
            )

            logger.info("Published new PDQ hash")

    metrics.flush()
def load(cls, bucket_name: str):
    """
    Download this index class's pickled index from ``bucket_name``.

    The object key comes from ``cls._get_index_s3_key()``. The download and
    the unpickling are both timed under the indexer's download_index metric.

    Returns the unpickled object.
    """
    with metrics.timer(metrics.names.indexer.download_index):
        s3_object = get_s3_client().get_object(
            Bucket=bucket_name,
            Key=cls._get_index_s3_key(),
        )
        index_file_bytes = s3_object["Body"].read()
        return pickle.loads(index_file_bytes)
def lambda_handler(event, context):
    """
    Rebuild the PDQ index from every PDQ data file in the data folder.

    Listens to SQS events fired when new data files are added to the data
    bucket's data directory. Events that do not touch PDQ data are ignored.
    Otherwise every object under the ThreatExchange data folder whose key
    ends with the PDQ file extension is fetched and parsed, the rows are
    merged with merge_pdq_files, and the resulting PDQIndex is pickled and
    written to the indexes bucket. An empty folder listing yields an index
    built from no rows rather than an error.

    As per the default configuration, the bucket must be
    - the hashing data bucket
      eg. dipanjanm-hashing-data20210224213427723700000003
    - the key name must be threat_exchange_data/pdq.te

    Which means adding new versions of the datasets will not have an effect.
    You must add the exact pdq.te file.
    """
    if not was_pdq_data_updated(event):
        logger.info("PDQ Data Not Updated, skipping")
        return

    logger.info("PDQ Data Updated, updating pdq hash index")

    logger.info("Retreiving PDQ Data from S3")

    with metrics.timer(metrics.names.pdq_indexer_lambda.download_datafiles):
        # S3 doesn't have a built in concept of folders but the AWS UI
        # implements folder-like functionality using prefixes. We follow
        # this same convention here using folder name in a prefix search.
        listing = s3_client.list_objects_v2(
            Bucket=THREAT_EXCHANGE_DATA_BUCKET_NAME,
            Prefix=THREAT_EXCHANGE_DATA_FOLDER,
        )
        # S3 leaves "Contents" out of the response entirely when nothing
        # matches the prefix; treat that as "no files" instead of crashing.
        # NOTE(review): a single list_objects_v2 call returns at most one
        # page of keys; pagination is not handled here.
        s3_bucket_files = listing.get("Contents", [])
        logger.info("Found %d Files", len(s3_bucket_files))

        pdq_data_files = [
            get_pdq_file(file["Key"])
            for file in s3_bucket_files
            if file["Key"].endswith(THREAT_EXCHANGE_PDQ_FILE_EXTENSION)
        ]
        logger.info("Found %d PDQ Files", len(pdq_data_files))

    with metrics.timer(metrics.names.pdq_indexer_lambda.parse_datafiles):
        logger.info("Parsing PDQ Hash files")
        pdq_data = [
            parse_pdq_file(**pdq_data_file) for pdq_data_file in pdq_data_files
        ]

    with metrics.timer(metrics.names.pdq_indexer_lambda.merge_datafiles):
        logger.info("Merging PDQ Hash files")
        flat_pdq_data = [
            hash_row for pdq_file in pdq_data for hash_row in pdq_file
        ]

        merged_pdq_data = reduce(merge_pdq_files, flat_pdq_data, {}).items()

    with metrics.timer(metrics.names.pdq_indexer_lambda.build_index):
        logger.info("Creating PDQ Hash Index")
        index = PDQIndex.build(merged_pdq_data)

    logger.info("Putting index in S3")
    index_bytes = pickle.dumps(index)

    with metrics.timer(metrics.names.pdq_indexer_lambda.upload_index):
        s3_client.put_object(
            Bucket=INDEXES_BUCKET_NAME, Key=PDQ_INDEX_KEY, Body=index_bytes
        )

    logger.info("Index update complete")
    metrics.flush()
def lambda_handler(event, context):
    """
    Rebuild the index of every signal type whose data files were updated.

    Listens to SQS events fired when new data files are added to the data
    bucket's data directory. The event is reduced to a mapping keyed by
    signal type; when it is empty nothing is done. For each updated signal
    type all of its data files are reloaded from S3, merged with
    merge_threat_exchange_files, built into the mapped index class and saved
    to the indexes bucket.

    As per the default configuration, the bucket must be
    - the hashing data bucket eg. dipanjanm-hashing-<...>
    - the key name must be in the ThreatExchange folder
      (eg. threat_exchange_data/)
    - the key name must return a signal_type in
      ThreatUpdateS3Store.get_signal_type_from_object_key
    """
    updates = get_updated_files_by_signal_type(event)

    logger.info(updates)
    if not updates:
        logger.info("Signal Data Not Updated, skipping")
        return

    indicator_types = ",".join(str(signal_type) for signal_type in updates)
    logger.info(f"Received updates for indicator_types: {indicator_types}")

    # Note: even though we know which files were updated, threatexchange
    # indexes do not yet allow adding new entries. So, we must do a full
    # rebuild. So, we only end up using the signal types that were updated,
    # not the actual files that changed.
    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        threat_exchange_pdq_file_extension="PHASE_OUT_THIS_INTERNAL_DETAIL",
    )

    for updated_signal_type in updates:
        adapter = _ADAPTER_MAPPING[updated_signal_type](
            config=s3_config, metrics_logger=metrics.names.indexer
        )
        data_files = adapter.load_data()

        with metrics.timer(metrics.names.indexer.merge_datafiles):
            logger.info(f"Merging {updated_signal_type} Hash files")
            # Collapse {file name: rows} into one flat list of rows.
            flattened_data = []
            for file_rows in data_files.values():
                flattened_data.extend(file_rows)

            merged_by_hash = functools.reduce(
                merge_threat_exchange_files, flattened_data, {}
            )
            merged_data = merged_by_hash.values()

        with metrics.timer(metrics.names.indexer.build_index):
            logger.info(f"Rebuilding {updated_signal_type} Index")

            index_class = INDEX_MAPPING[updated_signal_type]
            index: S3BackedInstrumentedIndexMixin = index_class.build(merged_data)

            logger.info(f"Putting {updated_signal_type} index in S3")
            index.save(bucket_name=INDEXES_BUCKET_NAME)

        metrics.flush()

    logger.info("Index updates complete")
def lambda_handler(event, context):
    """
    Match newly generated hashes against the PDQ index and notify via SNS.

    Listens to SQS events fired when a new hash is generated. Loads the index
    stored in S3 (bucket INDEXES_BUCKET_NAME, key PDQ_INDEX_KEY) and queries
    it with each hash. Every match is written to DynamoDB as a
    PDQMatchRecord, and one MatchMessage covering all matches of a hash is
    published to the output SNS topic.

    Note this is in contrast with hasher and indexer. They publish to SQS
    directly. Publishing to SQS implies there can be only one consumer.
    Because the matcher publishes to SNS, multiple queues can be plugged in
    behind it.
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        incoming = json.loads(sqs_record["body"])
        if incoming.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = incoming["hash"]
        key = incoming["key"]
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if not results:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")
            continue

        match_ids = []
        for match in results:
            metadata = match.metadata
            logger.info(
                "Match found for key: %s, hash %s -> %s", key, hash_str, metadata
            )
            signal_id = metadata["id"]

            # TODO: Add source (threatexchange) tags to match record
            match_record = PDQMatchRecord(
                key,
                hash_str,
                current_datetime,
                signal_id,
                metadata["source"],
                metadata["hash"],
            )
            match_record.write_to_table(records_table)
            match_ids.append(signal_id)

        # TODO: Add source (threatexchange) tags to match message
        match_details = [
            DatasetMatchDetails(banked_indicator_id=matched_id)
            for matched_id in match_ids
        ]
        match_message = MatchMessage(
            content_key=key,
            content_hash=hash_str,
            match_details=match_details,
        )

        # Publish one message for the set of matches.
        sns_client.publish(
            TopicArn=OUTPUT_TOPIC_ARN, Message=match_message.to_sns_message()
        )

    metrics.flush()
def lambda_handler(event, context):
    """
    Listens to SQS events fired when new hash is generated. Loads the index
    stored in an S3 bucket and looks for a match.

    As per the default configuration
    - the index data bucket is INDEXES_BUCKET_NAME
    - the key name must be PDQ_INDEX_KEY

    For each match, the match's privacy groups are first narrowed down to the
    ones for which get_privacy_group_matcher_active returns a truthy value.
    Matches left with no privacy group are ignored. For the others a
    PDQMatchRecord is written, a PDQSignalMetadata entry is written per
    privacy group if not already present, and one BankedSignal per privacy
    group is collected.

    When matched, publishes a notification to an SNS endpoint. Note this is in
    contrast with hasher and indexer. They publish to SQS directly. Publishing
    to SQS implies there can be only one consumer.

    Because, here, in the matcher, we publish to SNS, we can plug multiple
    queues behind it and profit!
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])
        if message.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = message["hash"]
        key = message["key"]
        # One timestamp is shared by every record written for this hash.
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if results:
            match_ids = []
            matching_banked_signals: t.List[BankedSignal] = []
            for match in results:
                metadata = match.metadata
                logger.info("Match found for key: %s, hash %s -> %s", key,
                            hash_str, metadata)
                privacy_group_list = metadata.get("privacy_groups", [])
                # Overwrite the match's privacy groups in place with only the
                # ones whose matcher is reported active.
                metadata["privacy_groups"] = list(
                    filter(
                        lambda x: get_privacy_group_matcher_active(
                            str(x),
                            time.time() // CACHED_TIME,
                            # CACHED_TIME default to 300 seconds, this will
                            # convert time.time() to an int parameter which
                            # changes every 300 seconds
                        ),
                        privacy_group_list,
                    ))
                # Skip matches that have no active privacy group left.
                if metadata["privacy_groups"]:
                    signal_id = str(metadata["id"])

                    with metrics.timer(metrics.names.pdq_matcher_lambda.
                                       write_match_record):
                        # TODO: Add source (threatexchange) tags to match record
                        PDQMatchRecord(
                            key,
                            hash_str,
                            current_datetime,
                            signal_id,
                            metadata["source"],
                            metadata["hash"],
                        ).write_to_table(records_table)

                    for pg in metadata.get("privacy_groups", []):
                        # Only update the metadata if it is not found in the
                        # table. Once initially created it is the fetcher's
                        # job to keep the item up to date.
                        PDQSignalMetadata(
                            signal_id,
                            pg,
                            current_datetime,
                            metadata["source"],
                            metadata["hash"],
                            metadata["tags"].get(pg, []),
                        ).write_to_table_if_not_found(records_table)

                    match_ids.append(signal_id)

                    # TODO: change naming upstream and here from privacy_group[s]
                    # to dataset[s]
                    for privacy_group in metadata.get("privacy_groups", []):
                        banked_signal = BankedSignal(str(signal_id),
                                                     str(privacy_group),
                                                     str(metadata["source"]))
                        # Tags recorded for this privacy group become the
                        # banked signal's classifications.
                        for tag in metadata["tags"].get(privacy_group, []):
                            banked_signal.add_classification(tag)
                        matching_banked_signals.append(banked_signal)

            # TODO: Add source (threatexchange) tags to match message
            # Nothing is published when every match was filtered out above.
            if matching_banked_signals:
                match_message = MatchMessage(
                    content_key=key,
                    content_hash=hash_str,
                    matching_banked_signals=matching_banked_signals,
                )

                logger.info(f"Publishing match_message: {match_message}")

                # Publish one message for the set of matches.
                sns_client.publish(TopicArn=OUTPUT_TOPIC_ARN,
                                   Message=match_message.to_aws_json())
        else:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")

    metrics.flush()