Example #1
0
    def matches() -> MatchSummariesResponse:
        """
        Build the list of match summaries for the current request.

        Three optional query-string parameters narrow the result:

        - ``content_q``: look up records by content id (takes precedence).
        - ``signal_q``: look up records by signal id, combined with
          ``signal_source`` (an empty string is passed when it is absent).

        When neither ``content_q`` nor ``signal_q`` is given, the records come
        from ``PDQMatchRecord.get_from_time_range``.
        """
        query = bottle.request.query
        # Falsy query values are collapsed to None.
        signal_filter = query.signal_q or None
        source_filter = query.signal_source or None
        content_filter = query.content_q or None

        if content_filter:
            found = PDQMatchRecord.get_from_content_id(dynamodb_table,
                                                       content_filter)
        elif signal_filter:
            found = PDQMatchRecord.get_from_signal(
                dynamodb_table,
                signal_filter,
                source_filter or "",
            )
        else:
            found = PDQMatchRecord.get_from_time_range(dynamodb_table)

        def _to_summary(rec):
            # The leading len(image_folder_key) characters are sliced off the
            # stored content id before it is returned to the caller.
            return MatchSummary(
                content_id=rec.content_id[len(image_folder_key):],
                signal_id=rec.signal_id,
                signal_source=rec.signal_source,
                updated_at=rec.updated_at.isoformat(),
                reactions="Mocked",
            )

        summaries = [_to_summary(rec) for rec in found]
        return MatchSummariesResponse(match_summaries=summaries)
Example #2
0
def get_match_details(content_id: str) -> t.List[MatchDetailsResult]:
    """
    Return the match details recorded for one piece of content.

    The lookup key is ``IMAGE_FOLDER_KEY`` followed by ``content_id``; the
    first ``IMAGE_FOLDER_KEY_LEN`` characters are sliced off each record's
    content id in the result. Every entry carries the same placeholder
    metadata and actions objects.

    A falsy ``content_id`` yields an empty list without querying the table.
    """
    if not content_id:
        return []

    lookup_key = f"{IMAGE_FOLDER_KEY}{content_id}"
    found = PDQMatchRecord.get_from_content_id(dynamodb.Table(DYNAMODB_TABLE),
                                               lookup_key)

    # TODO these mocked metadata should either be added to
    # PDQMatchRecord or some other look up in the data model
    placeholder_metadata = MatchDetailsMetadata(
        type="HASH_PDQ",
        tags=["mocked_t1", "mocked_t2"],
        status="MOCKED_STATUS",
        opinions=["mocked_a1", "mocked_a2"],
    )
    placeholder_actions = ["Mocked_False_Postive", "Mocked_Delete"]

    details = []
    for rec in found:
        details.append({
            "content_id": rec.content_id[IMAGE_FOLDER_KEY_LEN:],
            "content_hash": rec.content_hash,
            "signal_id": rec.signal_id,
            "signal_hash": rec.signal_hash,
            "signal_source": rec.signal_source,
            "updated_at": rec.updated_at.isoformat(),
            # The same metadata/actions objects are shared by every entry.
            "meta_data": placeholder_metadata,
            "actions": placeholder_actions,
        })
    return details
Example #3
0
def gen_matches() -> t.List[MatchesResult]:
    """
    Return one summary dict per record from the time-range lookup.

    Records are read from the ``DYNAMODB_TABLE`` table via
    ``PDQMatchRecord.get_from_time_range``. The first
    ``IMAGE_FOLDER_KEY_LEN`` characters are sliced off each content id, and
    ``reactions`` is a fixed placeholder value.
    """
    found = PDQMatchRecord.get_from_time_range(dynamodb.Table(DYNAMODB_TABLE))

    summaries = []
    for rec in found:
        summaries.append({
            "content_id": rec.content_id[IMAGE_FOLDER_KEY_LEN:],
            "signal_id": rec.signal_id,
            "signal_source": rec.signal_source,
            "updated_at": rec.updated_at.isoformat(),
            "reactions": "TODO",
        })
    return summaries
Example #4
0
def lambda_handler(event, context):
    """
    Match newly generated hashes against the PDQ index.

    Triggered by SQS events fired when a new hash is generated. The index is
    loaded from an S3 bucket once per invocation and queried once per SQS
    record.

    As per the default configuration
    - the index data bucket is INDEXES_BUCKET_NAME
    - the key name must be PDQ_INDEX_KEY

    Each match is written to the DYNAMODB_TABLE table; one SNS message
    listing all matched ids is published per hash that had matches. Records
    whose body has ``"Event" == "TestEvent"`` are skipped.
    """
    match_table = dynamodb.Table(DYNAMODB_TABLE)

    index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for queued in event["Records"]:
        body = json.loads(queued["body"])
        if body.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = body["hash"]
        key = body["key"]
        matched_at = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            hits = index.query(hash_str)

        # Nothing matched: log it and move on to the next SQS record.
        if not hits:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")
            continue

        match_ids = []
        for hit in hits:
            hit_metadata = hit.metadata
            logger.info("Match found for key: %s, hash %s -> %s", key,
                        hash_str, hit_metadata)
            matched_id = hit_metadata["id"]

            match_record = PDQMatchRecord(
                key,
                hash_str,
                matched_at,
                matched_id,
                hit_metadata["source"],
                hit_metadata["hash"],
            )
            match_record.write_to_table(match_table)

            match_ids.append(matched_id)

        # A single notification covers every match found for this hash.
        sns_client.publish(
            TopicArn=OUTPUT_TOPIC_ARN,
            Subject="Match found in pdq_matcher lambda",
            Message=
            f"Match found for key: {key}, hash: {hash_str}, for IDs: {match_ids}",
        )

    metrics.flush()
Example #5
0
def gen_match_details(content_id: str) -> t.List[MatchDetailsResult]:
    """
    Return the match details recorded for one piece of content.

    The lookup key is ``IMAGE_FOLDER_KEY`` followed by ``content_id``; the
    first ``IMAGE_FOLDER_KEY_LEN`` characters are sliced off each record's
    content id in the result.

    A falsy ``content_id`` yields an empty list without querying the table.
    """
    if not content_id:
        return []

    lookup_key = f"{IMAGE_FOLDER_KEY}{content_id}"
    found = PDQMatchRecord.get_from_content_id(dynamodb.Table(DYNAMODB_TABLE),
                                               lookup_key)

    details = []
    for rec in found:
        details.append({
            "content_id": rec.content_id[IMAGE_FOLDER_KEY_LEN:],
            "content_hash": rec.content_hash,
            "signal_id": rec.signal_id,
            "signal_hash": rec.signal_hash,
            "signal_source": rec.signal_source,
            "updated_at": rec.updated_at.isoformat(),
        })
    return details
Example #6
0
def get_match_details(table: Table, content_id: str) -> t.List[MatchDetail]:
    """
    Return a ``MatchDetail`` for every match record stored for ``content_id``.

    Records are fetched from ``table`` with
    ``PDQMatchRecord.get_from_content_id``; each detail's ``metadata`` is
    obtained from ``get_signal_details`` for the record's signal id and
    source.

    A falsy ``content_id`` yields an empty list without querying the table.
    """
    if not content_id:
        return []

    details = []
    for rec in PDQMatchRecord.get_from_content_id(table, f"{content_id}"):
        details.append(
            MatchDetail(
                content_id=rec.content_id,
                content_hash=rec.content_hash,
                signal_id=rec.signal_id,
                signal_hash=rec.signal_hash,
                signal_source=rec.signal_source,
                signal_type=rec.SIGNAL_TYPE,
                updated_at=rec.updated_at.isoformat(),
                metadata=get_signal_details(table, rec.signal_id,
                                            rec.signal_source),
            ))
    return details
def lambda_handler(event, context):
    """
    Listens to SQS events fired when new hash is generated. Loads the index
    stored in an S3 bucket and looks for a match.

    As per the default configuration
    - the index data bucket is INDEXES_BUCKET_NAME
    - the key name must be PDQ_INDEX_KEY

    When matched, publishes a notification to an SNS endpoint. Note this is in
    contrast with hasher and indexer. They publish to SQS directly. Publishing
    to SQS implies there can be only one consumer.

    Because, here, in the matcher, we publish to SNS, we can plug multiple
    queues behind it and profit!

    Per SQS record:
    - records whose body has ``"Event" == "TestEvent"`` are skipped;
    - each index match is kept only if at least one of its privacy groups
      passes ``get_privacy_group_matcher_active``;
    - for a kept match, a ``PDQMatchRecord`` is written, a
      ``PDQSignalMetadata`` is written (if not already present) per active
      privacy group, and one ``BankedSignal`` per active privacy group is
      collected;
    - if any banked signals were collected, a single ``MatchMessage`` is
      published to ``OUTPUT_TOPIC_ARN``.
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])
        if message.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = message["hash"]
        key = message["key"]
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if not results:
            logger.info("No matches found for key: %s hash: %s", key,
                        hash_str)
            continue

        matching_banked_signals: t.List[BankedSignal] = []
        for match in results:
            metadata = match.metadata
            logger.info("Match found for key: %s, hash %s -> %s", key,
                        hash_str, metadata)

            # The second argument is an integer-valued bucket that changes
            # every CACHED_TIME seconds (300 by default, per the original
            # note), so a cached lookup is refreshed at that interval.
            active_privacy_groups = [
                pg for pg in metadata.get("privacy_groups", [])
                if get_privacy_group_matcher_active(
                    str(pg),
                    time.time() // CACHED_TIME,
                )
            ]
            # NOTE(review): this overwrites the list on the metadata object
            # returned by the index query. Kept for backward compatibility;
            # confirm whether that object is shared with the loaded index.
            metadata["privacy_groups"] = active_privacy_groups
            if not active_privacy_groups:
                continue

            signal_id = str(metadata["id"])
            signal_source = metadata["source"]
            signal_hash = metadata["hash"]
            tags_by_group = metadata["tags"]

            with metrics.timer(
                    metrics.names.pdq_matcher_lambda.write_match_record):
                # TODO: Add source (threatexchange) tags to match record
                PDQMatchRecord(
                    key,
                    hash_str,
                    current_datetime,
                    signal_id,
                    signal_source,
                    signal_hash,
                ).write_to_table(records_table)

            for privacy_group in active_privacy_groups:
                # Only write the metadata if it is not found in the table;
                # once initially created it is the fetcher's job to keep the
                # item up to date.
                PDQSignalMetadata(
                    signal_id,
                    privacy_group,
                    current_datetime,
                    signal_source,
                    signal_hash,
                    tags_by_group.get(privacy_group, []),
                ).write_to_table_if_not_found(records_table)

            # TODO: change naming upstream and here from privacy_group[s]
            # to dataset[s]
            for privacy_group in active_privacy_groups:
                banked_signal = BankedSignal(signal_id, str(privacy_group),
                                             str(signal_source))
                for tag in tags_by_group.get(privacy_group, []):
                    banked_signal.add_classification(tag)
                matching_banked_signals.append(banked_signal)

        # TODO: Add source (threatexchange) tags to match message
        if matching_banked_signals:
            match_message = MatchMessage(
                content_key=key,
                content_hash=hash_str,
                matching_banked_signals=matching_banked_signals,
            )

            logger.info("Publishing match_message: %s", match_message)

            # Publish one message for the set of matches.
            sns_client.publish(TopicArn=OUTPUT_TOPIC_ARN,
                               Message=match_message.to_aws_json())

    metrics.flush()
Example #8
0
def lambda_handler(event, context):
    """
    Match newly generated hashes against the PDQ index and announce matches.

    Triggered by SQS events fired when a new hash is generated. The index is
    loaded from an S3 bucket once per invocation and queried once per SQS
    record.

    As per the default configuration
    - the index data bucket is INDEXES_BUCKET_NAME
    - the key name must be PDQ_INDEX_KEY

    When matched, a notification is published to an SNS endpoint. This is in
    contrast with hasher and indexer, which publish to SQS directly;
    publishing to SQS implies there can be only one consumer, whereas
    multiple queues can be plugged behind the SNS topic used here.
    """
    match_table = dynamodb.Table(DYNAMODB_TABLE)

    index: PDQIndex = get_index(INDEXES_BUCKET_NAME, PDQ_INDEX_KEY)
    logger.info("loaded_hash_index")

    for queued in event["Records"]:
        body = json.loads(queued["body"])
        if body.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = body["hash"]
        key = body["key"]
        matched_at = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            hits = index.query(hash_str)

        # Nothing matched: log it and move on to the next SQS record.
        if not hits:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")
            continue

        matched_ids = []
        for hit in hits:
            hit_metadata = hit.metadata
            logger.info("Match found for key: %s, hash %s -> %s", key,
                        hash_str, hit_metadata)
            matched_id = hit_metadata["id"]

            # TODO: Add source (threatexchange) tags to match record
            match_record = PDQMatchRecord(
                key,
                hash_str,
                matched_at,
                matched_id,
                hit_metadata["source"],
                hit_metadata["hash"],
            )
            match_record.write_to_table(match_table)

            matched_ids.append(matched_id)

        # TODO: Add source (threatexchange) tags to match message
        match_details = [
            DatasetMatchDetails(banked_indicator_id=matched_id)
            for matched_id in matched_ids
        ]
        match_message = MatchMessage(
            content_key=key,
            content_hash=hash_str,
            match_details=match_details,
        )

        # Publish one message for the set of matches.
        sns_client.publish(TopicArn=OUTPUT_TOPIC_ARN,
                           Message=match_message.to_sns_message())

    metrics.flush()