def matches() -> MatchSummariesResponse:
    """
    Returns all, or a filtered list of, matches.
    """
    signal_q = bottle.request.query.signal_q or None
    signal_source = bottle.request.query.signal_source or None
    content_q = bottle.request.query.content_q or None

    if content_q:
        records = PDQMatchRecord.get_from_content_id(dynamodb_table, content_q)
    elif signal_q:
        records = PDQMatchRecord.get_from_signal(
            dynamodb_table, signal_q, signal_source or ""
        )
    else:
        records = PDQMatchRecord.get_from_time_range(dynamodb_table)

    return MatchSummariesResponse(
        match_summaries=[
            MatchSummary(
                content_id=record.content_id[len(image_folder_key):],
                signal_id=record.signal_id,
                signal_source=record.signal_source,
                updated_at=record.updated_at.isoformat(),
                reactions="Mocked",
            )
            for record in records
        ]
    )
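
# A minimal client-side sketch of the filtering behavior above, assuming the
# handler is routed at /matches behind an HTTP endpoint (both the route and
# API_URL are illustrative assumptions, not defined in this module).
# content_q takes precedence over signal_q; with neither present, the handler
# falls back to an unfiltered time-range scan.
import requests

API_URL = "https://hma.example.com"  # hypothetical deployment URL

# No filter: falls through to PDQMatchRecord.get_from_time_range.
all_matches = requests.get(f"{API_URL}/matches").json()

# Filter by signal; signal_source is optional and defaults to "" in the lookup.
by_signal = requests.get(
    f"{API_URL}/matches", params={"signal_q": "5678", "signal_source": "te"}
).json()

# Filter by content id.
by_content = requests.get(
    f"{API_URL}/matches", params={"content_q": "photo1.jpg"}
).json()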
def get_match_details(content_id: str) -> t.List[MatchDetailsResult]:
    if not content_id:
        return []

    table = dynamodb.Table(DYNAMODB_TABLE)
    records = PDQMatchRecord.get_from_content_id(
        table, f"{IMAGE_FOLDER_KEY}{content_id}"
    )

    # TODO: this mocked metadata should either be added to
    # PDQMatchRecord or some other lookup in the data model
    mocked_metadata = MatchDetailsMetadata(
        type="HASH_PDQ",
        tags=["mocked_t1", "mocked_t2"],
        status="MOCKED_STATUS",
        opinions=["mocked_a1", "mocked_a2"],
    )
    mocked_actions = ["Mocked_False_Positive", "Mocked_Delete"]

    return [
        {
            "content_id": record.content_id[IMAGE_FOLDER_KEY_LEN:],
            "content_hash": record.content_hash,
            "signal_id": record.signal_id,
            "signal_hash": record.signal_hash,
            "signal_source": record.signal_source,
            "updated_at": record.updated_at.isoformat(),
            "meta_data": mocked_metadata,
            "actions": mocked_actions,
        }
        for record in records
    ]
def gen_matches() -> t.List[MatchesResult]:
    table = dynamodb.Table(DYNAMODB_TABLE)
    records = PDQMatchRecord.get_from_time_range(table)
    return [
        {
            "content_id": record.content_id[IMAGE_FOLDER_KEY_LEN:],
            "signal_id": record.signal_id,
            "signal_source": record.signal_source,
            "updated_at": record.updated_at.isoformat(),
            "reactions": "TODO",
        }
        for record in records
    ]
def lambda_handler(event, context):
    """
    Listens to SQS events fired when a new hash is generated. Loads the
    index stored in an S3 bucket and looks for a match.

    As per the default configuration
    - the index data bucket is INDEXES_BUCKET_NAME
    - the key name must be PDQ_INDEX_KEY
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = message["hash"]
        key = message["key"]
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if results:
            match_ids = []
            for match in results:
                metadata = match.metadata
                logger.info(
                    "Match found for key: %s, hash %s -> %s", key, hash_str, metadata
                )
                signal_id = metadata["id"]

                PDQMatchRecord(
                    key,
                    hash_str,
                    current_datetime,
                    signal_id,
                    metadata["source"],
                    metadata["hash"],
                ).write_to_table(records_table)

                match_ids.append(signal_id)

            sns_client.publish(
                TopicArn=OUTPUT_TOPIC_ARN,
                Subject="Match found in pdq_matcher lambda",
                Message=f"Match found for key: {key}, hash: {hash_str}, for IDs: {match_ids}",
            )
        else:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")

    metrics.flush()
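
# A minimal local-invocation sketch for the handler above, assuming the
# module-level config (DYNAMODB_TABLE, INDEXES_BUCKET_NAME, PDQ_INDEX_KEY,
# OUTPUT_TOPIC_ARN) and AWS clients are already wired up. The event mirrors
# what the handler reads from SQS: a JSON body with "hash" and "key". The
# hash below is a 64-hex-char placeholder, not a real PDQ hash.
import json

fake_event = {
    "Records": [
        # A "TestEvent" body is disregarded by the handler.
        {"body": json.dumps({"Event": "TestEvent"})},
        {"body": json.dumps({"hash": "f" * 64, "key": "images/photo1.jpg"})},
    ]
}

lambda_handler(fake_event, context=None)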
def gen_match_details(content_id: str) -> t.List[MatchDetailsResult]:
    if not content_id:
        return []

    table = dynamodb.Table(DYNAMODB_TABLE)
    records = PDQMatchRecord.get_from_content_id(
        table, f"{IMAGE_FOLDER_KEY}{content_id}"
    )

    return [
        {
            "content_id": record.content_id[IMAGE_FOLDER_KEY_LEN:],
            "content_hash": record.content_hash,
            "signal_id": record.signal_id,
            "signal_hash": record.signal_hash,
            "signal_source": record.signal_source,
            "updated_at": record.updated_at.isoformat(),
        }
        for record in records
    ]
def get_match_details(table: Table, content_id: str) -> t.List[MatchDetail]:
    if not content_id:
        return []

    records = PDQMatchRecord.get_from_content_id(table, content_id)

    return [
        MatchDetail(
            content_id=record.content_id,
            content_hash=record.content_hash,
            signal_id=record.signal_id,
            signal_hash=record.signal_hash,
            signal_source=record.signal_source,
            signal_type=record.SIGNAL_TYPE,
            updated_at=record.updated_at.isoformat(),
            metadata=get_signal_details(table, record.signal_id, record.signal_source),
        )
        for record in records
    ]
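
# A minimal call-site sketch, assuming AWS credentials are configured and a
# DynamoDB table with the HMA schema exists (the table name below is
# illustrative, not taken from this module).
import boto3

table = boto3.resource("dynamodb").Table("HMADataStore")  # hypothetical name

for detail in get_match_details(table, "images/photo1.jpg"):
    print(detail.signal_id, detail.signal_source, detail.updated_at)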
def lambda_handler(event, context):
    """
    Listens to SQS events fired when a new hash is generated. Loads the
    index stored in an S3 bucket and looks for a match.

    As per the default configuration
    - the index data bucket is INDEXES_BUCKET_NAME
    - the key name must be PDQ_INDEX_KEY

    When matched, publishes a notification to an SNS endpoint. Note this is
    in contrast with the hasher and indexer, which publish to SQS directly.
    Publishing to SQS implies there can be only one consumer; because the
    matcher publishes to SNS, we can plug multiple queues behind it and
    profit!
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = message["hash"]
        key = message["key"]
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if results:
            match_ids = []
            matching_banked_signals: t.List[BankedSignal] = []
            for match in results:
                metadata = match.metadata
                logger.info(
                    "Match found for key: %s, hash %s -> %s", key, hash_str, metadata
                )
                privacy_group_list = metadata.get("privacy_groups", [])
                # Keep only privacy groups whose matcher is active. The second
                # argument buckets time.time() into CACHED_TIME-wide windows
                # (CACHED_TIME defaults to 300 seconds), so the lookup's cache
                # key only changes every CACHED_TIME seconds.
                metadata["privacy_groups"] = list(
                    filter(
                        lambda x: get_privacy_group_matcher_active(
                            str(x),
                            time.time() // CACHED_TIME,
                        ),
                        privacy_group_list,
                    )
                )

                if metadata["privacy_groups"]:
                    signal_id = str(metadata["id"])

                    with metrics.timer(
                        metrics.names.pdq_matcher_lambda.write_match_record
                    ):
                        # TODO: Add source (threatexchange) tags to match record
                        PDQMatchRecord(
                            key,
                            hash_str,
                            current_datetime,
                            signal_id,
                            metadata["source"],
                            metadata["hash"],
                        ).write_to_table(records_table)

                    for pg in metadata.get("privacy_groups", []):
                        # Only write the metadata if it is not already in the
                        # table; once initially created, it is the fetcher's
                        # job to keep the item up to date.
                        PDQSignalMetadata(
                            signal_id,
                            pg,
                            current_datetime,
                            metadata["source"],
                            metadata["hash"],
                            metadata["tags"].get(pg, []),
                        ).write_to_table_if_not_found(records_table)

                    match_ids.append(signal_id)

                    # TODO: change naming upstream and here from
                    # privacy_group[s] to dataset[s]
                    for privacy_group in metadata.get("privacy_groups", []):
                        banked_signal = BankedSignal(
                            str(signal_id), str(privacy_group), str(metadata["source"])
                        )
                        for tag in metadata["tags"].get(privacy_group, []):
                            banked_signal.add_classification(tag)
                        matching_banked_signals.append(banked_signal)

            # TODO: Add source (threatexchange) tags to match message
            if matching_banked_signals:
                match_message = MatchMessage(
                    content_key=key,
                    content_hash=hash_str,
                    matching_banked_signals=matching_banked_signals,
                )

                logger.info(f"Publishing match_message: {match_message}")

                # Publish one message for the set of matches.
                sns_client.publish(
                    TopicArn=OUTPUT_TOPIC_ARN, Message=match_message.to_aws_json()
                )
        else:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")

    metrics.flush()
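
# A sketch of the time-bucketed caching trick used for
# get_privacy_group_matcher_active above, assuming the real function is
# wrapped in functools.lru_cache (an assumption; the actual implementation
# may cache differently). Passing time.time() // CACHED_TIME as an extra
# argument gives the cache a key that only changes every CACHED_TIME
# seconds, so entries expire on that cadence without a true TTL cache.
import functools
import time

CACHED_TIME = 300  # seconds, matching the default noted above


@functools.lru_cache(maxsize=128)
def get_privacy_group_matcher_active(privacy_group_id: str, cache_bucket: float) -> bool:
    # Hypothetical stand-in: the real function looks up matcher config.
    print(f"cache miss for {privacy_group_id}")
    return True


# Within the same 300-second window, the second call is served from cache.
get_privacy_group_matcher_active("pg1", time.time() // CACHED_TIME)
get_privacy_group_matcher_active("pg1", time.time() // CACHED_TIME)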
def lambda_handler(event, context):
    """
    Listens to SQS events fired when a new hash is generated. Loads the
    index stored in an S3 bucket and looks for a match.

    As per the default configuration
    - the index data bucket is INDEXES_BUCKET_NAME
    - the key name must be PDQ_INDEX_KEY

    When matched, publishes a notification to an SNS endpoint. Note this is
    in contrast with the hasher and indexer, which publish to SQS directly.
    Publishing to SQS implies there can be only one consumer; because the
    matcher publishes to SNS, we can plug multiple queues behind it and
    profit!
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = message["hash"]
        key = message["key"]
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if results:
            match_ids = []
            for match in results:
                metadata = match.metadata
                logger.info(
                    "Match found for key: %s, hash %s -> %s", key, hash_str, metadata
                )
                signal_id = metadata["id"]

                # TODO: Add source (threatexchange) tags to match record
                PDQMatchRecord(
                    key,
                    hash_str,
                    current_datetime,
                    signal_id,
                    metadata["source"],
                    metadata["hash"],
                ).write_to_table(records_table)

                match_ids.append(signal_id)

            # TODO: Add source (threatexchange) tags to match message
            match_message = MatchMessage(
                content_key=key,
                content_hash=hash_str,
                match_details=[
                    DatasetMatchDetails(banked_indicator_id=signal_id)
                    for signal_id in match_ids
                ],
            )

            # Publish one message for the set of matches.
            sns_client.publish(
                TopicArn=OUTPUT_TOPIC_ARN, Message=match_message.to_sns_message()
            )
        else:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")

    metrics.flush()
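
# A minimal fan-out sketch for the SNS-vs-SQS point in the docstring above,
# assuming boto3 credentials; the topic ARN and queue name are illustrative.
# Several queues can subscribe to the matcher's output topic, so multiple
# downstream consumers each receive every match message. (The queue access
# policy that lets SNS deliver to the queue is omitted for brevity.)
import boto3

sns = boto3.client("sns")
sqs = boto3.client("sqs")

queue_url = sqs.create_queue(QueueName="hma-match-consumer-queue")["QueueUrl"]
queue_arn = sqs.get_queue_attributes(
    QueueUrl=queue_url, AttributeNames=["QueueArn"]
)["Attributes"]["QueueArn"]

sns.subscribe(
    TopicArn="arn:aws:sns:us-east-1:123456789012:hma-matches",  # hypothetical
    Protocol="sqs",
    Endpoint=queue_arn,
)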