Code example #1
def lambda_handler(event, context):
    """
    Runs on a schedule. On each run, gets all data files for
    ALL_INDEXABLE_SIGNAL_TYPES from s3, converts the raw data file into an index
    and writes to an output S3 bucket.

    As per the default configuration, the bucket must be
    - the hashing data bucket eg. dipanjanm-hashing-<...>
    - the key name must be in the ThreatExchange folder (eg.
      threat_exchange_data/)
    - the key name must return a signal_type in
      ThreatUpdateS3Store.get_signal_type_from_object_key
    """
    # Note: even though we know which files were updated, threatexchange indexes
    # do not yet allow adding new entries, so we must do a full rebuild. As a
    # result, we only end up using the signal types that were updated, not the
    # actual files that changed.

    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
    )

    banks_table = BanksTable(dynamodb.Table(BANKS_TABLE))

    for signal_type in ALL_INDEXABLE_SIGNAL_TYPES:
        adapter_class = _ADAPTER_MAPPING[signal_type]
        data_files = adapter_class(
            config=s3_config,
            metrics_logger=metrics.names.indexer).load_data()

        bank_data = get_all_bank_hash_rows(signal_type, banks_table)

        with metrics.timer(metrics.names.indexer.merge_datafiles):
            logger.info(f"Merging {signal_type} Hash files")

            # go from dict[filename, list<hash rows>] → list<hash rows>
            flattened_data = [
                hash_row for file_ in data_files.values() for hash_row in file_
            ]

            merged_data = functools.reduce(merge_hash_rows_on_hash_value,
                                           flattened_data + bank_data,
                                           {}).values()

        with metrics.timer(metrics.names.indexer.build_index):
            logger.info(f"Rebuilding {signal_type} Index")

            for index_class in INDEX_MAPPING[signal_type]:
                index: S3BackedInstrumentedIndexMixin = index_class.build(
                    merged_data)

                logger.info(
                    f"Putting {signal_type} index in S3 for index {index.get_index_class_name()}"
                )
                index.save(bucket_name=INDEXES_BUCKET_NAME)
            metrics.flush()

    logger.info("Index updates complete")
Code example #2
def lambda_handler(event, context):
    """
    Listens to SQS events fired when new data files are added to the data
    bucket's data directory. If the updated key matches a set of criteria,
    converts the raw data file into an index and writes to an output S3 bucket.

    As per the default configuration, the bucket must be
    - the hashing data bucket eg.
      dipanjanm-hashing-data20210224213427723700000003
    - the key name must be in the ThreatExchange folder (eg. threat_exchange_data/)
    - the key name must be a pdq file ending in ".pdq.te"

    This means adding new versions of the datasets will not have an effect; you
    must add the exact .pdq.te file.
    """

    if not was_pdq_data_updated(event):
        logger.info("PDQ Data Not Updated, skipping")
        return

    logger.info("PDQ Data Updated, updating pdq hash index")
    metrics_logger = metrics.names.pdq_indexer_lambda

    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        threat_exchange_pdq_file_extension=THREAT_EXCHANGE_PDQ_FILE_EXTENSION,
    )

    pdq_data_files = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics_logger).load_data()

    with metrics.timer(metrics_logger.merge_datafiles):
        logger.info("Merging PDQ Hash files")
        flat_pdq_data = [
            hash_row for pdq_file in pdq_data_files.values()
            for hash_row in pdq_file
        ]

        merged_pdq_data = reduce(merge_pdq_files, flat_pdq_data, {}).values()

    with metrics.timer(metrics_logger.build_index):
        logger.info("Creating PDQ Hash Index")
        index = PDQIndex.build(merged_pdq_data)

        logger.info("Putting index in S3")
        index_bytes = pickle.dumps(index)

    with metrics.timer(metrics_logger.upload_index):
        s3_client.put_object(Bucket=INDEXES_BUCKET_NAME,
                             Key=PDQ_INDEX_KEY,
                             Body=index_bytes)

    logger.info("Index update complete")
    metrics.flush()
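
Example #2 gates all work on was_pdq_data_updated(event), which is not shown. Below is a minimal sketch under the assumption that the Lambda receives SQS messages whose bodies are standard S3 event notifications and that the configured extension is ".pdq.te" (per the docstring); the project's real helper may handle more cases, such as test events.

import json

THREAT_EXCHANGE_PDQ_FILE_EXTENSION = ".pdq.te"  # assumed default, per the docstring

def was_pdq_data_updated(event) -> bool:
    """Return True if any wrapped S3 record points at a .pdq.te key."""
    for sqs_record in event.get("Records", []):
        body = json.loads(sqs_record.get("body", "{}"))
        for s3_record in body.get("Records", []):
            key = s3_record.get("s3", {}).get("object", {}).get("key", "")
            if key.endswith(THREAT_EXCHANGE_PDQ_FILE_EXTENSION):
                return True
    return False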
Code example #3
File: api_root.py Project: tamanobi/ThreatExchange
def get_signal_hash_count() -> t.Dict[str, int]:
    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        threat_exchange_pdq_file_extension=THREAT_EXCHANGE_PDQ_FILE_EXTENSION,
    )
    pdq_storage = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics.names.api_hash_count())
    pdq_data_files = pdq_storage.load_data()

    return {file_name: len(rows) for file_name, rows in pdq_data_files.items()}
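
A brief, hypothetical usage of example #3: the returned mapping of file name to row count makes per-file and total reporting trivial.

counts = get_signal_hash_count()
for file_name, row_count in sorted(counts.items()):
    print(f"{file_name}: {row_count} hashes")
print(f"{len(counts)} data files, {sum(counts.values())} hashes total")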
Code example #4
def _get_signal_hash_count_and_last_modified(
    threat_exchange_data_bucket_name: str,
    threat_exchange_data_folder: str,
    threat_exchange_pdq_file_extension: str,
) -> t.Dict[str, t.Tuple[int, str]]:
    # TODO: this method is expensive; some cache or memoization approach might be a good idea.

    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=threat_exchange_data_bucket_name,
        threat_exchange_data_folder=threat_exchange_data_folder,
        threat_exchange_pdq_file_extension=threat_exchange_pdq_file_extension,
    )
    pdq_storage = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics.names.api_hash_count())
    pdq_data_files = pdq_storage.load_data()
    return {
        file_name: (len(rows), pdq_storage.last_modified[file_name])
        for file_name, rows in pdq_data_files.items()
    }
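
The TODO in example #4 flags that reloading every PDQ file per call is expensive. One possible approach (not part of the project) is a time-bucketed functools.lru_cache, sketched below; the 5-minute TTL and wrapper names are arbitrary assumptions for illustration.

import functools
import time

def _ttl_bucket(ttl_seconds: int = 300) -> int:
    # Changes once per ttl_seconds, which invalidates the cached entry below.
    return int(time.time() // ttl_seconds)

@functools.lru_cache(maxsize=4)
def _cached_count_and_last_modified(bucket_name, folder, extension, _bucket):
    return _get_signal_hash_count_and_last_modified(bucket_name, folder, extension)

def get_signal_hash_count_and_last_modified_cached(bucket_name, folder, extension):
    # Reuses results for up to ~5 minutes per argument combination.
    return _cached_count_and_last_modified(bucket_name, folder, extension, _ttl_bucket())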
Code example #5
def lambda_handler(event, context):
    """
    Listens to SQS events fired when new data files are added to the data
    bucket's data directory. If the updated key matches a set of criteria,
    converts the raw data file into an index and writes to an output S3 bucket.

    As per the default configuration, the bucket must be
    - the hashing data bucket eg. dipanjanm-hashing-<...>
    - the key name must be in the ThreatExchange folder (eg.
      threat_exchange_data/)
    - the key name must return a signal_type in
      ThreatUpdateS3Store.get_signal_type_from_object_key
    """
    updates = get_updated_files_by_signal_type(event)

    logger.info(updates)
    if not updates:
        logger.info("Signal Data Not Updated, skipping")
        return

    logger.info(
        f"Received updates for indicator_types: {','.join(map(lambda x: str(x), updates.keys()))}"
    )

    # Note: even though we know which files were updated, threatexchange indexes
    # do not yet allow adding new entries, so we must do a full rebuild. As a
    # result, we only end up using the signal types that were updated, not the
    # actual files that changed.

    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        threat_exchange_pdq_file_extension="PHASE_OUT_THIS_INTERNAL_DETAIL",
    )

    for updated_signal_type in updates.keys():
        adapter_class = _ADAPTER_MAPPING[updated_signal_type]
        data_files = adapter_class(
            config=s3_config,
            metrics_logger=metrics.names.indexer).load_data()

        with metrics.timer(metrics.names.indexer.merge_datafiles):
            logger.info(f"Merging {updated_signal_type} Hash files")
            flattened_data = [
                hash_row for file_ in data_files.values() for hash_row in file_
            ]

            merged_data = functools.reduce(merge_threat_exchange_files,
                                           flattened_data, {}).values()

        with metrics.timer(metrics.names.indexer.build_index):
            logger.info(f"Rebuilding {updated_signal_type} Index")
            index_class = INDEX_MAPPING[updated_signal_type]
            index: S3BackedInstrumentedIndexMixin = index_class.build(
                merged_data)

            logger.info(f"Putting {updated_signal_type} index in S3")
            index.save(bucket_name=INDEXES_BUCKET_NAME)
            metrics.flush()

    logger.info("Index updates complete")