def test_matcher_filters_out_based_on_distance(self):
    """A match whose distance exceeds the threshold must be dropped by the filter."""
    with self.fresh_dynamodb():
        self._init_data_if_required()

        # Two matches against the same active privacy group; push one
        # past the distance threshold so only the other survives.
        near_match = self._active_pg_match()
        far_match = self._active_pg_match()
        far_match.distance = 100

        matcher = Matcher("", [PdqSignal, VideoMD5Signal], self.table_manager)
        surviving = matcher.filter_match_results([near_match, far_match], PdqSignal)

        self.assertEqual(
            len(surviving),
            1,
            "Failed to filter out match with distance > threshold",
        )
        self.assertEqual(
            surviving[0].distance,
            0,
            "Filtered out the wrong match. Match with distance = 100 should be filtered out.",
        )
def _matches_for_hash(
    signal_type: t.Type[SignalType], signal_value: str
) -> t.List[MatchesForHash]:
    """Query the index for a raw signal and wrap every hit as a MatchesForHash.

    ThreatExchange-backed results are collected first, then bank-backed
    results, preserving the original response ordering.
    """
    results = _get_matcher(indexes_bucket_name, banks_table=banks_table).match(
        signal_type, signal_value
    )

    wrapped: t.List[MatchesForHash] = []

    # First get all threatexchange objects
    for result in results:
        for te_metadata in Matcher.get_te_metadata_objects_from_match(
            signal_type, result
        ):
            wrapped.append(
                MatchesForHash(
                    match_distance=int(result.distance),
                    matched_signal=te_metadata,
                )
            )

    # now get all bank objects
    for result in results:
        for raw_metadata in result.metadata:
            if raw_metadata.get_source() != BANKS_SOURCE_SHORT_CODE:
                continue
            bank_metadata = t.cast(BankedSignalIndexMetadata, raw_metadata)
            wrapped.append(
                MatchesForHash(
                    match_distance=int(result.distance),
                    matched_signal=banks_table.get_bank_member(
                        bank_metadata.bank_member_id
                    ),
                )
            )

    return wrapped
def test_matcher_filters_out_based_on_bank_active(self):
    """Matches belonging to an inactive bank must be dropped by the filter."""
    with self.fresh_dynamodb():
        self._init_data_if_required()

        matcher = Matcher("", [PdqSignal, VideoMD5Signal], self.table_manager)
        candidates = [self._active_bank_match(), self._inactive_bank_match()]
        surviving = matcher.filter_match_results(candidates, PdqSignal)

        self.assertEqual(
            len(surviving), 1, "Failed to filter out inactive bank's match"
        )
        self.assertEqual(
            surviving[0].metadata[0].bank_member_id,
            self.active_bank_member.bank_member_id,
            "The filtered bank_member id is wrong. It should be the active bank's bank_member's id.",
        )
def test_matcher_filters_out_inactive_pg(self):
    """Matches for an inactive privacy group must be dropped by the filter."""
    with self.fresh_dynamodb():
        self._init_data_if_required()

        matcher = Matcher("", [PdqSignal, VideoMD5Signal], self.table_manager)
        candidates = [self._active_pg_match(), self._inactive_pg_match()]
        surviving = matcher.filter_match_results(candidates, PdqSignal)

        self.assertEqual(
            len(surviving), 1, "Failed to filter out inactive pg match"
        )
        self.assertEqual(
            surviving[0].metadata[0].privacy_group,
            self.active_pg.privacy_group_id,
            "The filtered privacy group id is wrong. It should be the active pg's id.",
        )
def get_matcher(banks_table: BanksTable) -> Matcher:
    """Return the lazily-initialized module-level Matcher singleton.

    On first call, constructs a Matcher over INDEXES_BUCKET_NAME with the
    supported signal types and caches it in the module global `_matcher`;
    subsequent calls return the cached instance.

    NOTE(review): `banks_table` only takes effect on the very first call —
    later callers get the Matcher built with the first caller's table.
    """
    global _matcher
    if _matcher is None:
        _matcher = Matcher(
            index_bucket_name=INDEXES_BUCKET_NAME,
            supported_signal_types=[PdqSignal, VideoMD5Signal],
            banks_table=banks_table,
        )
    return _matcher
def _get_matcher(index_bucket_name: str, banks_table: BanksTable) -> Matcher:
    """Lazily build and cache the module-level Matcher instance.

    The first call constructs the Matcher with the given bucket and banks
    table; every later call returns the same cached object.
    """
    global _matcher
    if _matcher is not None:
        return _matcher

    _matcher = Matcher(
        index_bucket_name=index_bucket_name,
        supported_signal_types=[PdqSignal, VideoMD5Signal],
        banks_table=banks_table,
    )
    return _matcher
def for_hash(request: MatchesForHashRequest) -> MatchesForHashResponse:
    """
    For a given hash/signal check the index(es) for matches and return the
    details.

    Read-only: unlike the submission path, metadata returned here is not
    written to any table.
    """
    matches = _get_matcher(indexes_bucket_name).match(
        request.signal_type, request.signal_value
    )

    metadata_objects: t.List[ThreatExchangeSignalMetadata] = []
    for match in matches:
        for obj in Matcher.get_metadata_objects_from_match(
            request.signal_type, match
        ):
            metadata_objects.append(obj)

    return MatchesForHashResponse(metadata_objects, request.signal_value)
HMAConfig.initialize(HMA_CONFIG_TABLE) @functools.lru_cache(maxsize=None) def get_dynamodb() -> DynamoDBServiceResource: return boto3.resource("dynamodb") @functools.lru_cache(maxsize=None) def get_sns_client() -> SNSClient: return boto3.client("sns") matcher = Matcher( index_bucket_name=INDEXES_BUCKET_NAME, supported_signal_types=[PdqSignal, VideoMD5Signal], ) logger = get_logger(__name__) def lambda_handler(event, context): """ Listens to SQS events fired when new hash is generated. Loads the index stored in an S3 bucket and looks for a match. When matched, publishes a notification to an SNS endpoint. Note this is in contrast with hasher and indexer. They publish to SQS directly. Publishing to SQS implies there can be only one consumer. Because, here, in the matcher, we publish to SNS, we can plug multiple
def _get_matcher(index_bucket_name: str) -> Matcher:
    """Build a fresh Matcher over the given index bucket.

    Unlike the cached accessors elsewhere, this constructs a new instance
    on every call.
    """
    supported = [PdqSignal, VideoMD5Signal]
    return Matcher(
        index_bucket_name=index_bucket_name,
        supported_signal_types=supported,
    )