def test_remove_pattern_recognizer(self):
    """
    Check that removing a recognizer from the store is reflected by the
    registry.

    The registry is queried three times: before anything is stored
    (expects zero custom recognizers), after the spaceship recognizer is
    added to the store mock (expects one), and after it is removed by name
    (expects zero again).
    """
    spaceship_recognizer = PatternRecognizer(
        "SPACESHIP",
        name="Spaceship recognizer",
        patterns=[Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)])

    store_mock = RecognizerStoreApiMock()
    registry = RecognizerRegistry(store_mock)

    # Nothing has been stored yet
    assert len(registry.get_custom_recognizers()) == 0

    # Store the spaceship recognizer; the registry should now report it
    store_mock.add_custom_pattern_recognizer(spaceship_recognizer)
    assert len(registry.get_custom_recognizers()) == 1

    # Remove it by name; the registry should be empty again
    store_mock.remove_recognizer("Spaceship recognizer")
    assert len(registry.get_custom_recognizers()) == 0
def __init__(self, registry=None, nlp_engine=None):
    """
    Set up the engine with a recognizer registry and an NLP engine.

    Defaults are created per call rather than in the signature: default
    argument values are evaluated only once, when the function is defined,
    so instances placed there would be shared by every object constructed
    without explicit arguments (and the shared registry would have its
    predefined recognizers loaded again for each new instance).

    :param registry: registry holding the recognizers; a new
        RecognizerRegistry is created when None
    :param nlp_engine: NLP engine to use; a new SpacyNlpEngine is created
        when None
    """
    if registry is None:
        registry = RecognizerRegistry()
    if nlp_engine is None:
        nlp_engine = SpacyNlpEngine()

    # load nlp module
    self.nlp_engine = nlp_engine
    # prepare registry
    self.registry = registry
    # load all recognizers
    registry.load_predefined_recognizers()
def __init__(self, registry=None, nlp_engine=None, app_tracer=None,
             enable_trace_pii=False, default_score_threshold=None):
    """
    AnalyzerEngine class: Orchestrating the detection of PII entities
    and all related logic

    Any collaborator that is not supplied (or is falsy) is replaced by a
    freshly created default. Predefined recognizers are loaded only when
    the registry's ``recognizers`` attribute is empty.

    :param registry: instance of type RecognizerRegistry
    :param nlp_engine: instance of type NlpEngine
        (for example SpacyNlpEngine)
    :param app_tracer: instance of type AppTracer,
        used to trace the logic used during each request
    :param enable_trace_pii: bool,
        defines whether PII values should be traced or not.
    :param default_score_threshold: Minimum confidence value
        for detected entities to be returned; None is stored as 0
    """
    # The default implementations are imported only when they are needed
    if not nlp_engine:
        from analyzer.nlp_engine import SpacyNlpEngine
        nlp_engine = SpacyNlpEngine()
    if not registry:
        from analyzer import RecognizerRegistry
        registry = RecognizerRegistry()
    app_tracer = app_tracer or AppTracer()

    self.nlp_engine = nlp_engine
    self.registry = registry

    # A registry that already holds recognizers is left untouched
    if not self.registry.recognizers:
        self.registry.load_predefined_recognizers()

    self.app_tracer = app_tracer
    self.enable_trace_pii = enable_trace_pii
    self.default_score_threshold = (
        0 if default_score_threshold is None else default_score_threshold
    )
def test_add_pattern_recognizer(self):
    """
    Check that a recognizer added to the store is returned by the registry.

    The registry starts with zero custom recognizers; after the rocket
    recognizer is added to the store mock it must return exactly one,
    carrying the original recognizer name and pattern name.
    """
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)])

    store_mock = RecognizerStoreApiMock()
    registry = RecognizerRegistry(store_mock)

    # Nothing has been stored yet
    assert len(registry.get_custom_recognizers()) == 0

    # Store the rocket recognizer and fetch the custom recognizers again
    store_mock.add_custom_pattern_recognizer(rocket_recognizer)
    fetched = registry.get_custom_recognizers()

    assert len(fetched) == 1
    added = fetched[0]
    assert added.patterns[0].name == "rocket pattern"
    assert added.name == "Rocket recognizer"
def test_when_allFields_is_true_and_entities_not_empty_exception(self):
    """
    Check that a request setting ``allFields`` while also listing explicit
    fields is rejected: ``Apply`` is expected to raise ValueError.
    """
    engine = AnalyzerEngine(registry=RecognizerRegistry())

    request = AnalyzeRequest()
    request.text = ("My name is David and I live in Seattle."
                    "Domain: microsoft.com ")

    # Ask for all fields *and* add a specific field to the same template
    template = request.analyzeTemplate
    template.allFields = True
    credit_card_field = template.fields.add()
    credit_card_field.name = 'CREDIT_CARD'
    credit_card_field.minScore = '0.5'

    with pytest.raises(ValueError):
        engine.Apply(request, None)
def test_when_allFields_is_true_full_recognizers_list_return_all_fields(
        self):
    """
    Check that a request with ``allFields`` set returns results for the
    entities present in the text (PERSON, LOCATION and DOMAIN_NAME).
    """
    analyze_engine = AnalyzerEngine(RecognizerRegistry())
    request = AnalyzeRequest()
    request.analyzeTemplate.allFields = True
    request.text = "My name is David and I live in Seattle." \
                   "Domain: microsoft.com "
    response = analyze_engine.Apply(request, None)

    # Validate the results container before iterating over it; checking
    # afterwards could never fail, since iteration would already have
    # raised on None.
    assert response.analyzeResults is not None

    returned_entities = [
        field.field.name for field in response.analyzeResults
    ]

    assert "PERSON" in returned_entities
    assert "LOCATION" in returned_entities
    assert "DOMAIN_NAME" in returned_entities
def test_cache_logic(self):
    """
    Exercise the registry's caching of custom recognizers against the
    store mock.

    Negative flow: a recognizer is added with ``skip_hash_update=True``;
    the registry must keep returning zero recognizers and the mock's
    ``times_accessed_storage`` counter must stay at 0.

    Positive flow: with a fresh mock and registry, the recognizer is added
    with ``skip_hash_update=False``; the registry must return it and the
    counter must reach 1.
    """
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)])

    # ---- Negative flow ----
    stale_store = RecognizerStoreApiMock()
    stale_registry = RecognizerRegistry(stale_store)

    # Empty store: nothing is returned and storage is not touched
    assert len(stale_registry.get_custom_recognizers()) == 0
    assert stale_store.times_accessed_storage == 0

    # Add a recognizer but leave the hash as it was
    stale_store.add_custom_pattern_recognizer(
        rocket_recognizer, skip_hash_update=True)

    # The registry still answers without the new recognizer...
    assert len(stale_registry.get_custom_recognizers()) == 0
    # ...and without having accessed the underlying storage
    assert stale_store.times_accessed_storage == 0

    # ---- Positive flow ----
    # Same steps, but this time the hash is updated on add
    fresh_store = RecognizerStoreApiMock()
    fresh_registry = RecognizerRegistry(fresh_store)

    fresh_registry.get_custom_recognizers()
    assert fresh_store.times_accessed_storage == 0

    fresh_store.add_custom_pattern_recognizer(
        rocket_recognizer, skip_hash_update=False)

    assert len(fresh_registry.get_custom_recognizers()) == 1
    # Storage was accessed this time
    assert fresh_store.times_accessed_storage == 1
def get_mock_recognizer_registry(self):
    """
    Build a RecognizerRegistry backed by a store mock and pre-populated
    with five mock recognizers.

    Four recognizers come from ``get_mock_pattern_recognizer`` and one from
    ``get_mock_custom_recognizer``.

    :return: RecognizerRegistry constructed with a RecognizerStoreApiMock
        and the list of five recognizers
    """
    # NOTE(review): the arguments look like (language, entity, name) -
    # confirm against the helper definitions.
    pattern_specs = (
        ("en", "PERSON", "1"),
        ("de", "PERSON", "2"),
        ("de", "ADDRESS", "3"),
        ("he", "ADDRESS", "4"),
    )
    recognizers = [
        self.get_mock_pattern_recognizer(*spec) for spec in pattern_specs
    ]
    recognizers.append(
        self.get_mock_custom_recognizer("he", ["PERSON", "ADDRESS"], "5"))

    return RecognizerRegistry(RecognizerStoreApiMock(), recognizers)