def lambda_handler(event, context): """ Runs on a schedule. On each run, gets all data files for ALL_INDEXABLE_SIGNAL_TYPES from s3, converts the raw data file into an index and writes to an output S3 bucket. As per the default configuration, the bucket must be - the hashing data bucket eg. dipanjanm-hashing-<...> - the key name must be in the ThreatExchange folder (eg. threat_exchange_data/) - the key name must return a signal_type in ThreatUpdateS3Store.get_signal_type_from_object_key """ # Note: even though we know which files were updated, threatexchange indexes # do not yet allow adding new entries. So, we must do a full rebuild. So, we # only end up using the signal types that were updated, not the actual files # that changed. s3_config = S3ThreatDataConfig( threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME, threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER, ) banks_table = BanksTable(dynamodb.Table(BANKS_TABLE)) for signal_type in ALL_INDEXABLE_SIGNAL_TYPES: adapter_class = _ADAPTER_MAPPING[signal_type] data_files = adapter_class( config=s3_config, metrics_logger=metrics.names.indexer).load_data() bank_data = get_all_bank_hash_rows(signal_type, banks_table) with metrics.timer(metrics.names.indexer.merge_datafiles): logger.info(f"Merging {signal_type} Hash files") # go from dict[filename, list<hash rows>] → list<hash rows> flattened_data = [ hash_row for file_ in data_files.values() for hash_row in file_ ] merged_data = functools.reduce(merge_hash_rows_on_hash_value, flattened_data + bank_data, {}).values() with metrics.timer(metrics.names.indexer.build_index): logger.info(f"Rebuilding {signal_type} Index") for index_class in INDEX_MAPPING[signal_type]: index: S3BackedInstrumentedIndexMixin = index_class.build( merged_data) logger.info( f"Putting {signal_type} index in S3 for index {index.get_index_class_name()}" ) index.save(bucket_name=INDEXES_BUCKET_NAME) metrics.flush() logger.info("Index updates complete")
def lambda_handler(event, context): """ Listens to SQS events fired when new data files are added to the data bucket's data directory. If the updated key matches a set of criteria, converts the raw data file into an index and writes to an output S3 bucket. As per the default configuration, the bucket must be - the hashing data bucket eg. dipanjanm-hashing-data20210224213427723700000003 - the key name must be in the ThreatExchange folder (eg. threat_exchange_data/) - the key name must be a pdq file ending in ".pdq.te" Which means adding new versions of the datasets will not have an effect. You must add the exact pdq.te file. """ if not was_pdq_data_updated(event): logger.info("PDQ Data Not Updated, skipping") return logger.info("PDQ Data Updated, updating pdq hash index") metrics_logger = metrics.names.pdq_indexer_lambda s3_config = S3ThreatDataConfig( threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME, threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER, threat_exchange_pdq_file_extension=THREAT_EXCHANGE_PDQ_FILE_EXTENSION, ) pdq_data_files = ThreatExchangeS3PDQAdapter( config=s3_config, metrics_logger=metrics_logger).load_data() with metrics.timer(metrics_logger.merge_datafiles): logger.info("Merging PDQ Hash files") flat_pdq_data = [ hash_row for pdq_file in pdq_data_files.values() for hash_row in pdq_file ] merged_pdq_data = reduce(merge_pdq_files, flat_pdq_data, {}).values() with metrics.timer(metrics_logger.build_index): logger.info("Creating PDQ Hash Index") index = PDQIndex.build(merged_pdq_data) logger.info("Putting index in S3") index_bytes = pickle.dumps(index) with metrics.timer(metrics_logger.upload_index): s3_client.put_object(Bucket=INDEXES_BUCKET_NAME, Key=PDQ_INDEX_KEY, Body=index_bytes) logger.info("Index update complete") metrics.flush()
def get_signal_hash_count() -> t.Dict[str, int]:
    """Returns a mapping of PDQ data file name to the number of hash rows in that file."""
    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        threat_exchange_pdq_file_extension=THREAT_EXCHANGE_PDQ_FILE_EXTENSION,
    )
    pdq_storage = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics.names.api_hash_count()
    )
    pdq_data_files = pdq_storage.load_data()

    return {file_name: len(rows) for file_name, rows in pdq_data_files.items()}

def _get_signal_hash_count_and_last_modified(
    threat_exchange_data_bucket_name: str,
    threat_exchange_data_folder: str,
    threat_exchange_pdq_file_extension: str,
) -> t.Dict[str, t.Tuple[int, str]]:
    """Returns a mapping of PDQ data file name to (hash row count, last modified)."""
    # TODO: this method is expensive; some cache or memoization method might be
    # a good idea.
    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=threat_exchange_data_bucket_name,
        threat_exchange_data_folder=threat_exchange_data_folder,
        threat_exchange_pdq_file_extension=threat_exchange_pdq_file_extension,
    )
    pdq_storage = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics.names.api_hash_count()
    )
    pdq_data_files = pdq_storage.load_data()

    return {
        file_name: (len(rows), pdq_storage.last_modified[file_name])
        for file_name, rows in pdq_data_files.items()
    }

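# The TODO above asks for caching. One hedged option (an illustration, not part
# of the repo): wrap the expensive call in functools.lru_cache and key it with a
# time bucket so entries naturally expire, since the underlying S3 data only
# changes when the sync job runs. The names and TTL below are hypothetical;
# callers would go through _cached_signal_hash_count_and_last_modified instead
# of calling the private helper directly.
import functools
import time

_CACHE_TTL_SECONDS = 300  # hypothetical: recompute at most every 5 minutes


@functools.lru_cache(maxsize=8)
def _cached_call(bucket: str, folder: str, extension: str, time_bucket: int):
    # time_bucket only keys the cache; it changes every TTL window, forcing a
    # recompute without any explicit invalidation logic.
    return _get_signal_hash_count_and_last_modified(bucket, folder, extension)


def _cached_signal_hash_count_and_last_modified(
    bucket: str, folder: str, extension: str
):
    return _cached_call(bucket, folder, extension, int(time.time() // _CACHE_TTL_SECONDS))
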
def lambda_handler(event, context): """ Listens to SQS events fired when new data files are added to the data bucket's data directory. If the updated key matches a set of criteria, converts the raw data file into an index and writes to an output S3 bucket. As per the default configuration, the bucket must be - the hashing data bucket eg. dipanjanm-hashing-<...> - the key name must be in the ThreatExchange folder (eg. threat_exchange_data/) - the key name must return a signal_type in ThreatUpdateS3Store.get_signal_type_from_object_key """ updates = get_updated_files_by_signal_type(event) logger.info(updates) if not updates: logger.info("Signal Data Not Updated, skipping") return logger.info( f"Received updates for indicator_types: {','.join(map(lambda x: str(x), updates.keys()))}" ) # Note: even though we know which files were updated, threatexchange indexes # do not yet allow adding new entries. So, we must do a full rebuild. So, we # only end up using the signal types that were updated, not the actual files # that changed. s3_config = S3ThreatDataConfig( threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME, threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER, threat_exchange_pdq_file_extension="PHASE_OUT_THIS_INTERNAL_DETAIL", ) for updated_signal_type in updates.keys(): adapter_class = _ADAPTER_MAPPING[updated_signal_type] data_files = adapter_class( config=s3_config, metrics_logger=metrics.names.indexer).load_data() with metrics.timer(metrics.names.indexer.merge_datafiles): logger.info(f"Merging {updated_signal_type} Hash files") flattened_data = [ hash_row for file_ in data_files.values() for hash_row in file_ ] merged_data = functools.reduce(merge_threat_exchange_files, flattened_data, {}).values() with metrics.timer(metrics.names.indexer.build_index): logger.info(f"Rebuilding {updated_signal_type} Index") index_class = INDEX_MAPPING[updated_signal_type] index: S3BackedInstrumentedIndexMixin = index_class.build( merged_data) logger.info(f"Putting {updated_signal_type} index in S3") index.save(bucket_name=INDEXES_BUCKET_NAME) metrics.flush() logger.info("Index updates complete")