def lambda_handler(event, context):
    """
    This is the main entry point for writing back to ThreatExchange. The
    action evaluator sends a writeback message by way of the writebacks
    queue, and here's where they're popped off and dealt with.
    """
    HMAConfig.initialize(os.environ["CONFIG_TABLE_NAME"])

    writebacks_performed = {}
    for sqs_record in event["Records"]:
        # TODO research max # sqs records / lambda_handler invocation
        writeback_message = WritebackMessage.from_aws_json(sqs_record["body"])
        logger.info("Writing Back: %s", writeback_message)

        # get all sources that are related to this writeback
        sources = {
            banked_signal.bank_source
            for banked_signal in writeback_message.banked_signals
        }
        source_writebackers = [
            Writebacker.get_writebacker_for_source(source)
            for source in sources
            if Writebacker.get_writebacker_for_source(source)
        ]

        for writebacker in source_writebackers:
            result = writebacker.perform_writeback(writeback_message)
            logger.info("Writeback result: %s", result)
            writebacks_performed[writebacker.source] = result

    return {"writebacks_performed": writebacks_performed}
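
# Hedged usage sketch (not part of the original module): the handler consumes
# the standard SQS event envelope, where each record's "body" carries a
# serialized WritebackMessage. The body below is a placeholder; the real JSON
# layout is whatever WritebackMessage.from_aws_json expects.
_example_sqs_event = {
    "Records": [
        {"body": "<serialized WritebackMessage JSON>"},  # placeholder payload
    ]
}
# lambda_handler(_example_sqs_event, context=None)  # needs CONFIG_TABLE_NAME set
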
def get_actions_api(hma_config_table: str) -> bottle.Bottle:
    # The documentation below expects prefix to be '/actions/'
    actions_api = bottle.Bottle()
    HMAConfig.initialize(hma_config_table)

    @actions_api.get("/", apply=[jsoninator])
    def fetch_all_actions() -> FetchAllActionsResponse:
        """
        Return all action configs.
        """
        action_configs = ActionPerformer.get_all()
        return FetchAllActionsResponse(
            actions_response=[config.__dict__ for config in action_configs]
        )

    @actions_api.put(
        "/<old_name>/<old_config_sub_type>",
        apply=[jsoninator(CreateUpdateActionRequest)],
    )
    def update_action(
        request: CreateUpdateActionRequest, old_name: str, old_config_sub_type: str
    ) -> UpdateActionResponse:
        """
        Update an action's URL and headers.
        """
        if old_name != request.name or old_config_sub_type != request.config_subtype:
            # The name field can't be updated because it is the primary key.
            # The config subtype can't be updated because it is a config
            # class-level param.
            delete_action(old_name)
            create_action(request)
        else:
            config = ActionPerformer._get_subtypes_by_name()[
                request.config_subtype
            ].getx(request.name)
            for key, value in request.fields.items():
                setattr(config, key, value)
            hmaconfig.update_config(config)
        return UpdateActionResponse(response="The action config is updated.")

    @actions_api.post("/", apply=[jsoninator(CreateUpdateActionRequest)])
    def create_action(request: CreateUpdateActionRequest) -> CreateActionResponse:
        """
        Create an action.
        """
        config = ActionPerformer._get_subtypes_by_name()[request.config_subtype](
            **{"name": request.name, **request.fields}
        )
        hmaconfig.create_config(config)
        return CreateActionResponse(response="The action config is created.")

    @actions_api.delete("/<name>", apply=[jsoninator])
    def delete_action(name: str) -> DeleteActionResponse:
        """
        Delete an action.
        """
        hmaconfig.delete_config_by_type_and_name("ActionPerformer", name)
        return DeleteActionResponse(response="The action config is deleted.")

    return actions_api
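
# Hedged wiring sketch (not part of the original module): the docstrings above
# assume the sub-app is mounted under '/actions/' by the root API, as done in
# bottle_init_once further down. "HMAConfigTable" is a placeholder table name.
root_app = bottle.default_app()
root_app.mount("/actions/", get_actions_api(hma_config_table="HMAConfigTable"))
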
def get(cls):
    logger.info(
        "Initializing configs using table name %s",
        os.environ["CONFIG_TABLE_NAME"],
    )
    HMAConfig.initialize(os.environ["CONFIG_TABLE_NAME"])
    return cls(
        actions_queue_url=os.environ["ACTIONS_QUEUE_URL"],
        sqs_client=boto3.client("sqs"),
    )
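
# Hedged sketch (not part of the original module): the config object above
# simply bundles an SQS client with the actions queue URL; sending to that
# queue uses the standard boto3 call shown here. The queue URL is a placeholder.
import boto3

sqs = boto3.client("sqs")
sqs.send_message(
    QueueUrl="https://sqs.us-east-1.amazonaws.com/123456789012/actions-queue",  # placeholder
    MessageBody="{}",  # placeholder payload
)
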
def get_matches_api(
    dynamodb_table: Table,
    hma_config_table: str,
    indexes_bucket_name: str,
    writeback_queue_url: str,
) -> bottle.Bottle:
    """
    A closure that includes all dependencies that MUST be provided by the root
    API that this API plugs into. Declare dependencies here, but initialize in
    the root API alone.
    """
    # A prefix to all routes must be provided by the api_root app.
    # The documentation below expects prefix to be '/matches/'
    matches_api = bottle.Bottle()
    HMAConfig.initialize(hma_config_table)

    @matches_api.get("/", apply=[jsoninator])
    def matches() -> MatchSummariesResponse:
        """
        Return all matches, or a filtered list of matches, based on query
        params.
        """
        signal_q = bottle.request.query.signal_q or None
        signal_source = bottle.request.query.signal_source or None
        content_q = bottle.request.query.content_q or None

        if content_q:
            records = MatchRecord.get_from_content_id(dynamodb_table, content_q)
        elif signal_q:
            records = MatchRecord.get_from_signal(
                dynamodb_table, signal_q, signal_source or ""
            )
        else:
            # TODO: Support pagination after implementing in UI.
            records = MatchRecord.get_recent_items_page(dynamodb_table).items

        return MatchSummariesResponse(
            match_summaries=[
                MatchSummary(
                    content_id=record.content_id,
                    signal_id=record.signal_id,
                    signal_source=record.signal_source,
                    updated_at=record.updated_at.isoformat(),
                )
                for record in records
            ]
        )

    @matches_api.get("/match/", apply=[jsoninator])
    def match_details() -> MatchDetailsResponse:
        """
        Return the match details for a given content id.
        """
        results = []
        if content_id := bottle.request.query.content_id or None:
            results = get_match_details(dynamodb_table, content_id)
        return MatchDetailsResponse(match_details=results)
def get_matches_api(dynamodb_table: Table, hma_config_table: str) -> bottle.Bottle:
    """
    A closure that includes all dependencies that MUST be provided by the root
    API that this API plugs into. Declare dependencies here, but initialize in
    the root API alone.
    """
    # A prefix to all routes must be provided by the api_root app.
    # The documentation below expects prefix to be '/matches/'
    matches_api = bottle.Bottle()
    HMAConfig.initialize(hma_config_table)

    @matches_api.get("/", apply=[jsoninator])
    def matches() -> MatchSummariesResponse:
        """
        Return all matches, or a filtered list of matches.
        """
        signal_q = bottle.request.query.signal_q or None
        signal_source = bottle.request.query.signal_source or None
        content_q = bottle.request.query.content_q or None

        if content_q:
            records = PDQMatchRecord.get_from_content_id(dynamodb_table, content_q)
        elif signal_q:
            records = PDQMatchRecord.get_from_signal(
                dynamodb_table, signal_q, signal_source or ""
            )
        else:
            records = PDQMatchRecord.get_from_time_range(dynamodb_table)

        return MatchSummariesResponse(
            match_summaries=[
                MatchSummary(
                    content_id=record.content_id,
                    signal_id=record.signal_id,
                    signal_source=record.signal_source,
                    updated_at=record.updated_at.isoformat(),
                )
                for record in records
            ]
        )

    @matches_api.get("/match/", apply=[jsoninator])
    def match_details() -> MatchDetailsResponse:
        """
        Match details API endpoint.

        Return format: match_details: [MatchDetailsResult]
        """
        results = []
        if content_id := bottle.request.query.content_id or None:
            results = get_match_details(dynamodb_table, content_id)
        return MatchDetailsResponse(match_details=results)
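
# Hedged usage sketch (not part of the original module): querying the mounted
# '/matches/' routes over HTTP. The base URL and signal id are placeholders,
# and this assumes jsoninator serializes the response dataclass's field names
# as JSON keys.
import requests

resp = requests.get(
    "https://example.execute-api.us-east-1.amazonaws.com/matches/",  # placeholder
    params={"signal_q": "5555555555555555", "signal_source": "te"},  # placeholder values
)
match_summaries = resp.json()["match_summaries"]
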
def lambda_init_once():
    """
    Do some late initialization for required lambda components.

    Lambda initialization is weird - despite the existence of perfectly good
    constructions like __name__ == "__main__", there don't appear to be easy
    ways to split your lambda-specific logic from your module logic except by
    splitting up the files and making your lambda entry as small as possible.

    TODO: Just refactor this file to separate the lambda and functional
    components.
    """
    cfg = FetcherConfig.get()
    HMAConfig.initialize(cfg.config_table_name)
def get(cls):
    logger.info(
        "Initializing configs using table name %s",
        os.environ["CONFIG_TABLE_NAME"],
    )
    logger.info(
        "Initializing dynamo table using table name %s",
        os.environ["DYNAMODB_TABLE"],
    )
    HMAConfig.initialize(os.environ["CONFIG_TABLE_NAME"])

    dynamo_db_table_name = os.environ["DYNAMODB_TABLE"]
    dynamodb: DynamoDBServiceResource = boto3.resource("dynamodb")
    writeback_queue_url = os.environ["WRITEBACKS_QUEUE_URL"]

    return cls(
        actions_queue_url=os.environ["ACTIONS_QUEUE_URL"],
        sqs_client=boto3.client("sqs"),
        dynamo_db_table=dynamodb.Table(dynamo_db_table_name),
        writeback_queue_url=writeback_queue_url,
    )
def _create_privacy_groups(self):
    # Since we already have a mock_dynamodb2 courtesy of BanksTableTestBase,
    # re-use it for initing configs. Requires some clever hot-wiring.
    config_test_mock = config_test.ConfigTest()
    config_test_mock.mock_dynamodb2 = self.__class__.mock_dynamodb2
    config_test_mock.create_mocked_table()
    HMAConfig.initialize(config_test_mock.TABLE_NAME)
    # Hot wiring ends...

    self.active_pg = ThreatExchangeConfig(
        "ACTIVE_PG", True, "", True, True, True, "ACTIVE_PG"
    )
    create_config(self.active_pg)

    # Active PG has a distance threshold of 31.
    create_config(AdditionalMatchSettingsConfig("ACTIVE_PG", 31))

    self.inactive_pg = ThreatExchangeConfig(
        "INACTIVE_PG", True, "", True, True, False, "INACTIVE_PG"
    )
    create_config(self.inactive_pg)
def get_datasets_api(hma_config_table: str) -> bottle.Bottle:
    # The documentation below expects prefix to be '/datasets/'
    datasets_api = bottle.Bottle()
    HMAConfig.initialize(hma_config_table)

    @datasets_api.get("/", apply=[jsoninator])
    def datasets() -> DatasetsResponse:
        """
        Return all datasets.
        """
        collabs = ThreatExchangeConfig.get_all()
        return DatasetsResponse(
            datasets_response=[Dataset.from_collab(collab) for collab in collabs]
        )

    @datasets_api.post("/update", apply=[jsoninator(UpdateDatasetRequest)])
    def update_dataset(request: UpdateDatasetRequest) -> Dataset:
        """
        Update a dataset's fetcher_active and write_back flags.
        """
        config = ThreatExchangeConfig.getx(str(request.privacy_group_id))
        config.fetcher_active = request.fetcher_active
        config.write_back = request.write_back
        updated_config = hmaconfig.update_config(config).__dict__
        updated_config["privacy_group_id"] = updated_config["name"]
        return Dataset.from_dict(updated_config)

    @datasets_api.post("/sync", apply=[jsoninator])
    def sync_datasets() -> SyncDatasetResponse:
        """
        Fetch new collaborations from ThreatExchange and potentially update
        the configs stored in AWS.
        """
        sync_privacy_groups()
        return SyncDatasetResponse(response="Dataset is up-to-date")

    return datasets_api
def get_action_rules_api(hma_config_table: str) -> bottle.Bottle:
    # The endpoints below imply a prefix of '/action-rules'
    action_rules_api = bottle.Bottle()
    HMAConfig.initialize(hma_config_table)

    @action_rules_api.get("/", apply=[jsoninator])
    def get_action_rules() -> ActionRulesResponse:
        """
        Return all action rules.
        """
        error_message = ""
        action_rules = []

        try:
            action_rules = ActionRule.get_all()
            logger.info("action_rules: %s", action_rules)
        except Exception as e:
            error_message = "Unexpected error."
            handle_unexpected_error(e)

        return ActionRulesResponse(error_message, action_rules)

    @action_rules_api.post("/", apply=[jsoninator(ActionRulesRequest)])
    def create_action_rule(
        request: ActionRulesRequest,
    ) -> ActionRulesResponse:
        """
        Create an action rule.
        """
        logger.info("request: %s", request)
        error_message = ""

        try:
            hmaconfig.create_config(request.action_rule)
        except ClientError as e:
            # TODO: this test for "already exists" should be moved to a common place
            if e.response["Error"]["Code"] == "ConditionalCheckFailedException":
                error_message = (
                    f"An action rule with the name '{request.action_rule.name}'"
                    " already exists."
                )
                logger.warning(
                    "Duplicate action rule creation attempted: %s",
                    e.response["Error"]["Message"],
                )
            else:
                error_message = "Unexpected error."
                logger.error(
                    "Unexpected client error: %s", e.response["Error"]["Message"]
                )
                logger.exception(e)
            response.status = 500
        except Exception as e:
            error_message = "Unexpected error."
            handle_unexpected_error(e)

        return ActionRulesResponse(error_message)

    @action_rules_api.put("/<old_name>", apply=[jsoninator(ActionRulesRequest)])
    def update_action_rule(
        request: ActionRulesRequest,
        old_name: str,
    ) -> ActionRulesResponse:
        """
        Update the action rule with name=<old_name>.
        """
        logger.info("old_name: %s", old_name)
        logger.info("request: %s", request)
        error_message = ""

        if ActionRule.exists(request.action_rule.name):
            try:
                hmaconfig.update_config(request.action_rule)
            except Exception as e:
                error_message = "Unexpected error."
                handle_unexpected_error(e)
        elif ActionRule.exists(old_name):
            try:
                hmaconfig.create_config(request.action_rule)
                hmaconfig.delete_config_by_type_and_name("ActionRule", old_name)
            except Exception as e:
                error_message = "Unexpected error."
                handle_unexpected_error(e)
        else:
            error_message = (
                f"An action rule named '{request.action_rule.name}' or"
                f" '{old_name}' does not exist."
            )
            logger.warning(
                "An attempt was made to update an action rule named either"
                " '%s' or '%s' but neither exists.",
                request.action_rule.name,
                old_name,
            )
            response.status = 500

        return ActionRulesResponse(error_message)

    @action_rules_api.delete("/<name>", apply=[jsoninator])
    def delete_action_rule(name: str) -> ActionRulesResponse:
        """
        Delete the action rule with name=<name>.
        """
        logger.info("name: %s", name)
        error_message = ""

        if ActionRule.exists(name):
            try:
                hmaconfig.delete_config_by_type_and_name("ActionRule", name)
            except Exception as e:
                error_message = "Unexpected error."
                handle_unexpected_error(e)
        else:
            error_message = f"An action rule named '{name}' does not exist."
            logger.warning(
                "An attempt was made to delete an action rule named '%s'"
                " that does not exist.",
                name,
            )
            response.status = 500

        return ActionRulesResponse(error_message)

    return action_rules_api
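
# Hedged usage sketch (not part of the original module): renaming a rule via
# the PUT route above, which creates the new rule and deletes the old one.
# The base URL is a placeholder, and the JSON body shown is hypothetical and
# likely incomplete; the real ActionRulesRequest shape is defined elsewhere.
import requests

requests.put(
    "https://example.execute-api.us-east-1.amazonaws.com/action-rules/old-rule-name",  # placeholder
    json={"action_rule": {"name": "new-rule-name"}},  # hypothetical payload
)
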
def get_datasets_api(
    hma_config_table: str,
    datastore_table: Table,
    threat_exchange_data_bucket_name: str,
    threat_exchange_data_folder: str,
    threat_exchange_pdq_file_extension: str,
) -> bottle.Bottle:
    # The documentation below expects prefix to be '/datasets/'
    datasets_api = bottle.Bottle()
    HMAConfig.initialize(hma_config_table)

    @datasets_api.get("/", apply=[jsoninator])
    def get_all_dataset_summaries() -> DatasetSummariesResponse:
        """
        Return summaries for all datasets. A summary includes all facts that
        are not configurable, eg. the dataset's name, the number of hashes it
        has, the number of matches it has caused, etc.
        """
        return DatasetSummariesResponse(
            threat_exchange_datasets=_get_threat_exchange_datasets(
                datastore_table,
                threat_exchange_data_bucket_name,
                threat_exchange_data_folder,
                threat_exchange_pdq_file_extension,
            ),
            test_datasets=[],
        )

    @datasets_api.post("/update", apply=[jsoninator(UpdateDatasetRequest)])
    def update_dataset(request: UpdateDatasetRequest) -> Dataset:
        """
        Update dataset values: fetcher_active, write_back, and matcher_active.
        """
        config = ThreatExchangeConfig.getx(str(request.privacy_group_id))
        config.fetcher_active = request.fetcher_active
        config.write_back = request.write_back
        config.matcher_active = request.matcher_active
        updated_config = hmaconfig.update_config(config).__dict__
        updated_config["privacy_group_id"] = updated_config["name"]
        return Dataset.from_dict(updated_config)

    @datasets_api.post("/create", apply=[jsoninator(CreateDatasetRequest)])
    def create_dataset(request: CreateDatasetRequest) -> CreateDatasetResponse:
        """
        Create a local dataset (defaults defined in CreateDatasetRequest).
        """
        assert isinstance(request, CreateDatasetRequest)

        create_privacy_group_if_not_exists(
            privacy_group_id=str(request.privacy_group_id),
            privacy_group_name=request.privacy_group_name,
            description=request.description,
            in_use=True,
            fetcher_active=request.fetcher_active,
            matcher_active=request.matcher_active,
            write_back=request.write_back,
        )

        return CreateDatasetResponse(
            response=f"Created dataset {request.privacy_group_id}"
        )

    @datasets_api.post("/sync", apply=[jsoninator])
    def sync_datasets() -> SyncDatasetResponse:
        """
        Fetch new collaborations from ThreatExchange and potentially update
        the configs stored in AWS.
        """
        sync_privacy_groups()
        return SyncDatasetResponse(response="Privacy groups are up to date")

    @datasets_api.post("/delete/<key>", apply=[jsoninator])
    def delete_dataset(key=None) -> DeleteDatasetResponse:
        """
        Delete a dataset.
        """
        config = ThreatExchangeConfig.getx(str(key))
        hmaconfig.delete_config(config)
        return DeleteDatasetResponse(response="The privacy group is deleted")

    return datasets_api
def remove_superseded_actions(
    action_label_to_action_rules: t.Dict[ActionLabel, t.List[ActionRule]],
) -> t.Dict[ActionLabel, t.List[ActionRule]]:
    """
    TODO implement

    Evaluates a dictionary of action labels and the associated action rules
    generated for a match message against the actions. Action labels that are
    superseded by another will be removed.
    """
    return action_label_to_action_rules


if __name__ == "__main__":
    # For basic debugging
    HMAConfig.initialize(os.environ["CONFIG_TABLE_NAME"])
    action_rules = get_action_rules()

    match_message = MatchMessage(
        content_key="m2",
        content_hash="361da9e6cf1b72f5cea0344e5bb6e70939f4c70328ace762529cac704297354a",
        matching_banked_signals=[
            BankedSignal(
                banked_content_id="3070359009741438",
                bank_id="258601789084078",
                bank_source="te",
                classifications={
                    BankedContentIDClassificationLabel(value="258601789084078"),
                    ClassificationLabel(value="true_positive"),
                    BankSourceClassificationLabel(value="te"),
                },
            ),
        ],
    )
logger = get_logger(__name__)
s3_client = boto3.client("s3")
sns_client: SNSClient = boto3.client("sns")
dynamodb = boto3.resource("dynamodb")

CACHED_TIME = 300
THRESHOLD = 31
LOCAL_INDEX_FILENAME = "/tmp/hashes.index"

INDEXES_BUCKET_NAME = os.environ["INDEXES_BUCKET_NAME"]
PDQ_INDEX_KEY = os.environ["PDQ_INDEX_KEY"]
OUTPUT_TOPIC_ARN = os.environ["PDQ_MATCHES_TOPIC_ARN"]
DYNAMODB_TABLE = os.environ["DYNAMODB_TABLE"]
HMA_CONFIG_TABLE = os.environ["HMA_CONFIG_TABLE"]

HMAConfig.initialize(HMA_CONFIG_TABLE)


@lru_cache(maxsize=None)
def get_index(bucket_name, key):
    """
    Load the given index from the s3 bucket and deserialize it.
    """
    # TODO Cache this index for a period of time to reduce S3 calls and bandwidth.
    with metrics.timer(metrics.names.pdq_matcher_lambda.download_index):
        with open(LOCAL_INDEX_FILENAME, "wb") as index_file:
            s3_client.download_fileobj(bucket_name, key, index_file)

    with metrics.timer(metrics.names.pdq_matcher_lambda.parse_index):
        with open(LOCAL_INDEX_FILENAME, "rb") as index_file:
            result = pickle.load(index_file)

    return result
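
# Hedged sketch (not part of the original module): get_index is memoized with
# lru_cache, so a warm Lambda never re-downloads the index. One way to honor
# the CACHED_TIME constant above is a small TTL wrapper; _last_load and the
# wrapper name are hypothetical.
import time

_last_load = 0.0


def get_index_with_ttl(bucket_name, key):
    """Re-download the index if it is older than CACHED_TIME seconds."""
    global _last_load
    if time.time() - _last_load > CACHED_TIME:
        get_index.cache_clear()  # lru_cache's built-in invalidation
        _last_load = time.time()
    return get_index(bucket_name, key)
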
def __init__(self, migration: str, config_table: str):
    self.migration = migration
    HMAConfig.initialize(config_table)
def bottle_init_once() -> t.Tuple[
    bottle.AppStack,
    t.Callable[[t.Dict[str, t.Any], t.Any], t.Dict[str, t.Any]],
]:
    """
    Meant to be called once per lambda instance. Returns a bottle app and an
    api_wsgi_handler that can be plugged into a lambda handler.

    The method also serves as a closure for all dependencies that need to be
    resolved at startup.
    """
    app = bottle.default_app()

    # Initialize hmaconfig at module level. Mounted SubApps need not
    # initialize their own HMAConfigs.
    HMAConfig.initialize(HMA_CONFIG_TABLE)

    functionality_mapping = get_pytx_functionality_mapping()

    @app.get("/root/")
    def root():
        """
        Root endpoint to make sure the API is live and check when it was last
        updated.
        """
        context = bottle.request.environ.get("apig_wsgi.context")
        invoked_function_arn = context.invoked_function_arn
        client = boto3.client("lambda")
        last_modified = client.get_function_configuration(
            FunctionName=invoked_function_arn
        )["LastModified"]

        return {
            "message": "Welcome to the HMA API!",
            "last_modified": last_modified,
        }

    app.mount(
        "/action-rules/",
        get_action_rules_api(hma_config_table=HMA_CONFIG_TABLE),
    )
    app.mount(
        "/matches/",
        get_matches_api(
            datastore_table=dynamodb.Table(DYNAMODB_TABLE),
            hma_config_table=HMA_CONFIG_TABLE,
            indexes_bucket_name=INDEXES_BUCKET_NAME,
            writeback_queue_url=WRITEBACK_QUEUE_URL,
            bank_table=dynamodb.Table(BANKS_TABLE),
            signal_type_mapping=functionality_mapping.signal_and_content,
        ),
    )
    app.mount(
        "/content/",
        get_content_api(
            dynamodb_table=dynamodb.Table(DYNAMODB_TABLE),
            image_bucket=IMAGE_BUCKET_NAME,
            image_prefix=IMAGE_PREFIX,
            signal_type_mapping=functionality_mapping.signal_and_content,
        ),
    )
    app.mount(
        "/submit/",
        get_submit_api(
            dynamodb_table=dynamodb.Table(DYNAMODB_TABLE),
            image_bucket=IMAGE_BUCKET_NAME,
            image_prefix=IMAGE_PREFIX,
            submissions_queue_url=SUBMISSIONS_QUEUE_URL,
            hash_queue_url=HASHES_QUEUE_URL,
            signal_type_mapping=functionality_mapping.signal_and_content,
        ),
    )
    app.mount(
        "/datasets/",
        get_datasets_api(
            hma_config_table=HMA_CONFIG_TABLE,
            datastore_table=dynamodb.Table(DYNAMODB_TABLE),
            threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
            threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        ),
    )
    app.mount(
        "/stats/",
        get_stats_api(counts_table=dynamodb.Table(COUNTS_TABLE_NAME)),
    )
    app.mount(
        "/actions/",
        get_actions_api(hma_config_table=HMA_CONFIG_TABLE),
    )
    app.mount(
        "/banks/",
        get_bank_api(
            bank_table=dynamodb.Table(BANKS_TABLE),
            bank_user_media_bucket=BANKS_MEDIA_BUCKET_NAME,
            submissions_queue_url=SUBMISSIONS_QUEUE_URL,
            signal_type_mapping=functionality_mapping.signal_and_content,
        ),
    )
    app.mount(
        "/indexes/",
        get_indexes_api(
            indexes_bucket_name=INDEXES_BUCKET_NAME,
            indexer_function_name=INDEXER_FUNCTION_NAME,
        ),
    )
    app.mount(
        "/lcc/",
        get_lcc_api(
            storage_path=LCC_DURABLE_FS_PATH,
            signal_type_mapping=functionality_mapping.signal_and_content,
        ),
    )

    apig_wsgi_handler = make_lambda_handler(app)
    return (app, apig_wsgi_handler)
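
# Hedged wiring sketch (not part of the original module): how the returned
# pair is typically plugged into the Lambda entry point, per the docstring
# above. Calling at module level keeps initialization to once per warm
# instance.
app, apig_wsgi_handler = bottle_init_once()


def lambda_handler(event, context):
    return apig_wsgi_handler(event, context)
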
def lambda_handler(event, context):
    """
    Runs on a schedule. On each run, gets all data files for
    ALL_INDEXABLE_SIGNAL_TYPES from s3, converts the raw data file into an
    index and writes to an output S3 bucket.

    As per the default configuration, the bucket must be
    - the hashing data bucket, eg. dipanjanm-hashing-<...>
    - the key name must be in the ThreatExchange folder (eg. threat_exchange_data/)
    - the key name must return a signal_type in
      ThreatUpdateS3Store.get_signal_type_from_object_key
    """
    # Note: even though we know which files were updated, threatexchange
    # indexes do not yet allow adding new entries. So, we must do a full
    # rebuild. So, we only end up using the signal types that were updated,
    # not the actual files that changed.
    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
    )

    banks_table = BanksTable(dynamodb.Table(BANKS_TABLE))

    HMAConfig.initialize(HMA_CONFIG_TABLE)
    signal_content_mapping = get_pytx_functionality_mapping()

    for signal_type in ALL_INDEXABLE_SIGNAL_TYPES:
        adapter_class = _ADAPTER_MAPPING[signal_type]
        data_files = adapter_class(
            config=s3_config, metrics_logger=metrics.names.indexer
        ).load_data()

        with metrics.timer(metrics.names.indexer.get_bank_data):
            bank_data = get_all_bank_hash_rows(signal_type, banks_table)

        with metrics.timer(metrics.names.indexer.merge_datafiles):
            logger.info(f"Merging {signal_type} Hash files")

            # go from dict[filename, list<hash rows>] → list<hash rows>
            flattened_data = [
                hash_row for file_ in data_files.values() for hash_row in file_
            ]

            merged_data = functools.reduce(
                merge_hash_rows_on_hash_value, flattened_data + bank_data, {}
            ).values()

        with metrics.timer(metrics.names.indexer.build_index):
            logger.info(f"Rebuilding {signal_type} Index")

            for index_class in [
                signal_type.get_index_cls()
                for signal_type in signal_content_mapping.signal_and_content.signal_type_by_name.values()
            ]:
                index: S3BackedInstrumentedIndexMixin = index_class.build(merged_data)

                logger.info(
                    f"Putting {signal_type} index in S3 for index {index.get_index_class_name()}"
                )
                index.save(bucket_name=INDEXES_BUCKET_NAME)

    metrics.flush()
    logger.info("Index updates complete")
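
# Hedged illustration (not part of the original module): a plausible shape for
# merge_hash_rows_on_hash_value, which the reduce above folds over. The
# (hash_value, metadata-set) row shape and the set-union merge are assumptions;
# the point is only that reduce(..., {}).values() yields one merged row per
# distinct hash value.
def merge_hash_rows_on_hash_value_sketch(accumulator, hash_row):
    hash_value, metadata = hash_row  # assumed row shape
    if hash_value in accumulator:
        _, existing_metadata = accumulator[hash_value]
        metadata = existing_metadata | metadata  # assumed set-union merge
    accumulator[hash_value] = (hash_value, metadata)
    return accumulator
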
def lambda_handler(event, context):
    """
    SQS events generated by the submissions API or by files being added to S3.
    Downloads files to temp storage, identifies content_type and generates
    allowed signal_types from it.

    Saves hash output to DynamoDB, sends a message on an output queue.

    Note that this brings the contents of a file into memory. This is subject
    to the resource limitation on the lambda. Potentially extendable up to
    10GB, but that would be super-expensive. [1]

    [1]: https://docs.aws.amazon.com/lambda/latest/dg/configuration-console.html
    """
    records_table = get_dynamodb().Table(DYNAMODB_TABLE)
    HMAConfig.initialize(HMA_CONFIG_TABLE)
    banks_table = BanksTable(
        get_dynamodb().Table(BANKS_TABLE),
        _get_signal_type_mapping(),
    )
    sqs_client = get_sqs_client()

    hasher = _get_hasher(_get_signal_type_mapping())

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "s3:TestEvent":
            continue

        media_to_process: t.List[
            t.Union[S3ImageSubmission, URLSubmissionMessage, BankSubmissionMessage]
        ] = []

        if URLSubmissionMessage.could_be(message):
            media_to_process.append(
                URLSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()
                )
            )
        elif S3ImageSubmissionBatchMessage.could_be(message):
            # S3 submissions can only be images for now.
            media_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_PREFIX
                ).image_submissions
            )
        elif BankSubmissionMessage.could_be(message):
            media_to_process.append(
                BankSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()
                )
            )
        else:
            logger.warning(f"Unprocessable Message: {message}")

        for media in media_to_process:
            if not hasher.supports(media.content_type):
                if isinstance(media, BankSubmissionMessage):
                    object_id = media.bank_id
                else:
                    object_id = media.content_id

                logger.warning(
                    f"Unprocessable content type: {media.content_type}, id: {object_id}"
                )
                continue

            with metrics.timer(metrics.names.hasher.download_file):
                try:
                    if hasattr(media, "key") and hasattr(media, "bucket"):
                        # Classic duck-typing. If it has key and bucket, it
                        # must be an S3 submission.
                        media = t.cast(S3ImageSubmission, media)
                        bytes_: bytes = S3BucketContentSource(
                            media.bucket, IMAGE_PREFIX
                        ).get_bytes(media.content_id)
                    else:
                        media = t.cast(URLSubmissionMessage, media)
                        bytes_ = URLContentSource().get_bytes(media.url)
                except Exception:
                    if isinstance(media, BankSubmissionMessage):
                        object_id = media.bank_id
                    else:
                        object_id = media.content_id

                    logger.exception(
                        f"Encountered exception while trying to get_bytes for id: {object_id}. Unable to hash content."
                    )
                    continue

            for signal in hasher.get_hashes(media.content_type, bytes_):
                if isinstance(media, BankSubmissionMessage):
                    # route signals to bank datastore only.
                    bank_operations.add_bank_member_signal(
                        banks_table=banks_table,
                        bank_id=media.bank_id,
                        bank_member_id=media.bank_member_id,
                        signal_type=signal.signal_type,
                        signal_value=signal.signal_value,
                    )
                    # don't write hash records etc.
                    continue

                hash_record = PipelineHashRecord(
                    content_id=media.content_id,
                    signal_type=signal.signal_type,
                    content_hash=signal.signal_value,
                    updated_at=datetime.datetime.now(),
                )

                hasher.write_hash_record(records_table, hash_record)
                hasher.publish_hash_message(sqs_client, hash_record)

    metrics.flush()
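
# Hedged smoke-test sketch (not part of the original module): S3 emits an
# "s3:TestEvent" message when bucket notifications are configured, and the
# handler above skips such records outright. Running this for real still
# needs the module's env vars and AWS resources (or mocks) in place.
import json

_test_event = {"Records": [{"body": json.dumps({"Event": "s3:TestEvent"})}]}
# lambda_handler(_test_event, context=None)  # would no-op past the record
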
def get_datasets_api(
    hma_config_table: str,
    datastore_table: Table,
    threat_exchange_data_bucket_name: str,
    threat_exchange_data_folder: str,
) -> bottle.Bottle:
    """
    ToDo / FixMe: this file is probably more about privacy groups than
    datasets...
    """
    # The documentation below expects prefix to be '/datasets/'
    datasets_api = SubApp()
    HMAConfig.initialize(hma_config_table)

    @datasets_api.get("/", apply=[jsoninator])
    def get_all_dataset_summaries() -> DatasetSummariesResponse:
        """
        Return summaries for all datasets. A summary includes all facts that
        are not configurable, eg. the dataset's name, the number of hashes it
        has, the number of matches it has caused, etc.
        """
        return DatasetSummariesResponse(
            threat_exchange_datasets=_get_threat_exchange_datasets(
                datastore_table,
                threat_exchange_data_bucket_name,
                threat_exchange_data_folder,
            )
        )

    @datasets_api.post("/update", apply=[jsoninator(UpdateDatasetRequest)])
    def update_dataset(request: UpdateDatasetRequest) -> Dataset:
        """
        Update dataset values: fetcher_active, write_back, and matcher_active.
        """
        config = ThreatExchangeConfig.getx(str(request.privacy_group_id))
        config.fetcher_active = request.fetcher_active
        config.write_back = request.write_back
        config.matcher_active = request.matcher_active
        updated_config = hmaconfig.update_config(config).__dict__
        updated_config["privacy_group_id"] = updated_config["name"]

        additional_config = AdditionalMatchSettingsConfig.get(
            str(request.privacy_group_id)
        )
        if request.pdq_match_threshold:
            if additional_config:
                additional_config.pdq_match_threshold = int(
                    request.pdq_match_threshold
                )
                hmaconfig.update_config(additional_config)
            else:
                additional_config = AdditionalMatchSettingsConfig(
                    str(request.privacy_group_id),
                    int(request.pdq_match_threshold),
                )
                hmaconfig.create_config(additional_config)
        elif additional_config:
            # pdq_match_threshold was set and now should be removed
            hmaconfig.delete_config(additional_config)

        return Dataset.from_dict(updated_config)

    @datasets_api.post("/create", apply=[jsoninator(CreateDatasetRequest)])
    def create_dataset(request: CreateDatasetRequest) -> CreateDatasetResponse:
        """
        Create a local dataset (defaults defined in CreateDatasetRequest).
        """
        assert isinstance(request, CreateDatasetRequest)

        create_privacy_group_if_not_exists(
            privacy_group_id=str(request.privacy_group_id),
            privacy_group_name=request.privacy_group_name,
            description=request.description,
            in_use=True,
            fetcher_active=request.fetcher_active,
            matcher_active=request.matcher_active,
            write_back=request.write_back,
        )

        return CreateDatasetResponse(
            response=f"Created dataset {request.privacy_group_id}"
        )

    @datasets_api.post("/sync", apply=[jsoninator])
    def sync_datasets() -> SyncDatasetResponse:
        """
        Fetch new collaborations from ThreatExchange and sync with the configs
        stored in DynamoDB.
        """
        sync_privacy_groups()
        return SyncDatasetResponse(response="Privacy groups are up to date")

    @datasets_api.post("/delete/<key>", apply=[jsoninator])
    def delete_dataset(key=None) -> DeleteDatasetResponse:
        """
        Delete the dataset with key=<key>.
        """
        config = ThreatExchangeConfig.getx(str(key))
        hmaconfig.delete_config(config)
        return DeleteDatasetResponse(response="The privacy group is deleted")

    @datasets_api.get("/match-settings", apply=[jsoninator])
    def get_all_match_settings() -> MatchSettingsResponse:
        """
        Return all match settings configs.
        """
        return MatchSettingsResponse(
            match_settings=[
                MatchSettingsResponseBody(c)
                for c in AdditionalMatchSettingsConfig.get_all()
            ]
        )

    @datasets_api.get("/match-settings/<key>", apply=[jsoninator])
    def get_match_settings(key=None) -> MatchSettingsResponseBody:
        """
        Return a match settings config for a given privacy_group_id.
        """
        if config := AdditionalMatchSettingsConfig.get(str(key)):
            return MatchSettingsResponseBody(config)

        return bottle.abort(400, f"No match_settings for pg_id {key} found")

    return datasets_api