def test_when_allFields_is_true_and_entities_not_empty_exception(self):
    """Apply() raises ValueError when allFields is set and a field is listed too."""
    engine = AnalyzerEngine(registry=RecognizerRegistry())

    req = AnalyzeRequest()
    req.text = ("My name is David and I live in Seattle."
                "Domain: microsoft.com ")

    # Explicitly request one entity...
    requested = req.analyzeTemplate.fields.add()
    requested.name = 'CREDIT_CARD'
    requested.minScore = '0.5'
    # ...while also asking for every entity: the two options conflict.
    req.analyzeTemplate.allFields = True

    with pytest.raises(ValueError):
        engine.Apply(req, None)
def test_when_allFields_is_true_full_recognizers_list_return_all_fields(
        self):
    """With allFields=True and the full registry, all entity types are returned.

    Checks that PERSON, LOCATION and DOMAIN_NAME all appear in the response
    of Apply() for a text containing a name, a city and a domain.
    """
    analyze_engine = AnalyzerEngine(RecognizerRegistry())
    request = AnalyzeRequest()
    request.analyzeTemplate.allFields = True
    request.text = "My name is David and I live in Seattle." \
                   "Domain: microsoft.com "

    response = analyze_engine.Apply(request, None)

    # Validate the container before iterating it; checking afterwards (as the
    # previous version did) could never fail because iteration would raise
    # first.
    assert response.analyzeResults is not None
    returned_entities = [
        result.field.name for result in response.analyzeResults
    ]

    assert "PERSON" in returned_entities
    assert "LOCATION" in returned_entities
    assert "DOMAIN_NAME" in returned_entities
def test_when_allFields_is_true_return_all_fields(self):
    """With allFields=True, Apply() returns every entity found in the text.

    Uses the mock registry and checks that CREDIT_CARD, PHONE_NUMBER and
    DOMAIN_NAME are all present in the response.
    """
    analyze_engine = AnalyzerEngine(MockRecognizerRegistry())
    request = AnalyzeRequest()
    request.analyzeTemplate.allFields = True
    request.text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090 " \
                   "Domain: microsoft.com"

    response = analyze_engine.Apply(request, None)

    # Validate the container before iterating it; checking afterwards (as the
    # previous version did) could never fail because iteration would raise
    # first.
    assert response.analyzeResults is not None
    returned_entities = [
        result.field.name for result in response.analyzeResults
    ]

    assert "CREDIT_CARD" in returned_entities
    assert "PHONE_NUMBER" in returned_entities
    assert "DOMAIN_NAME" in returned_entities
class TestAnalyzerEngine(TestCase):
    """Tests for AnalyzerEngine: analyze(), Apply(), thresholds, tracing
    and duplicate removal."""

    def setUp(self):
        """Build the fixtures shared by the tests in this class.

        Fixtures live in setUp rather than in an overridden __init__: the
        test runner creates one instance per test method either way, and
        this avoids super(TestAnalyzerEngine, self), which looks the class
        up by a module-level name that can be rebound.
        """
        self.loaded_registry = MockRecognizerRegistry(RecognizerStoreApiMock())
        mock_nlp_artifacts = NlpArtifacts([], [], [], [], None, "en")
        self.app_tracer = AppTracerMock(enable_interpretability=True)
        self.loaded_analyzer_engine = AnalyzerEngine(
            self.loaded_registry,
            MockNlpEngine(stopwords=[], punct_words=[],
                          nlp_artifacts=mock_nlp_artifacts),
            app_tracer=self.app_tracer,
            enable_trace_pii=True)
        # Correlation id passed to every analyze() call in these tests.
        self.unit_test_guid = "00000000-0000-0000-0000-000000000000"

    def test_analyze_with_predefined_recognizers_return_results(self):
        """Requesting only CREDIT_CARD yields exactly the credit card match."""
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD"]
        results = self.loaded_analyzer_engine.analyze(self.unit_test_guid,
                                                      text, entities,
                                                      language,
                                                      all_fields=False)

        assert len(results) == 1
        assert_result(results[0], "CREDIT_CARD", 14, 33,
                      EntityRecognizer.MAX_SCORE)

    def test_analyze_with_multiple_predefined_recognizers(self):
        """Credit card and phone number are both found, in text order."""
        # NOTE(review): in the literal below the phone number spans 47-58,
        # while the assertion expects 48-59. The text may have lost a space
        # somewhere before the number - confirm against the canonical test.
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # This analyzer engine is different from the shared one, as this one
        # also loads SpaCy so it can use the context words
        analyzer_engine_with_spacy = AnalyzerEngine(
            registry=self.loaded_registry, nlp_engine=loaded_spacy_nlp_engine)
        results = analyzer_engine_with_spacy.analyze(self.unit_test_guid,
                                                     text, entities,
                                                     language,
                                                     all_fields=False)

        assert len(results) == 2
        assert_result(results[0], "CREDIT_CARD", 14, 33,
                      EntityRecognizer.MAX_SCORE)
        expected_score = UsPhoneRecognizer.MEDIUM_REGEX_SCORE + \
            PatternRecognizer.CONTEXT_SIMILARITY_FACTOR  # 0.5 + 0.35 = 0.85
        assert_result(results[1], "PHONE_NUMBER", 48, 59, expected_score)

    def test_analyze_without_entities(self):
        """An empty entities list with all_fields=False raises ValueError."""
        with pytest.raises(ValueError):
            language = "en"
            text = " Credit card: 4095-2609-9393-4932, my name is John Oliver, DateTime: September 18 Domain: microsoft.com"
            entities = []
            self.loaded_analyzer_engine.analyze(self.unit_test_guid, text,
                                                entities, language,
                                                all_fields=False)

    def test_analyze_with_empty_text(self):
        """Empty text produces no results."""
        language = "en"
        text = ""
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]
        results = self.loaded_analyzer_engine.analyze(self.unit_test_guid,
                                                      text, entities,
                                                      language,
                                                      all_fields=False)

        assert len(results) == 0

    def test_analyze_with_unsupported_language(self):
        """A language the registry cannot serve ("de") raises ValueError."""
        with pytest.raises(ValueError):
            language = "de"
            text = ""
            entities = ["CREDIT_CARD", "PHONE_NUMBER"]
            self.loaded_analyzer_engine.analyze(self.unit_test_guid, text,
                                                entities, language,
                                                all_fields=False)

    def test_remove_duplicates(self):
        """Same span and entity type with different scores keeps the highest."""
        arr = [
            RecognizerResult(start=0, end=5, score=0.1, entity_type="x",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test',
                                 original_score=0,
                                 pattern_name='test',
                                 pattern='test',
                                 validation_result=None)),
            RecognizerResult(start=0, end=5, score=0.5, entity_type="x",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test',
                                 original_score=0,
                                 pattern_name='test',
                                 pattern='test',
                                 validation_result=None))
        ]
        # The method is private (double underscore), hence the mangled name.
        results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(arr)

        assert len(results) == 1
        assert results[0].score == 0.5
        # TODO: add more cases with bug:
        # bug# 597: Analyzer remove duplicates doesn't handle all cases of
        # one result as a substring of the other

    def test_remove_duplicates_different_entity_no_removal(self):
        """Same span but different entity types: neither result is removed."""
        arr = [
            RecognizerResult(start=0, end=5, score=0.1, entity_type="x",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test',
                                 original_score=0,
                                 pattern_name='test',
                                 pattern='test',
                                 validation_result=None)),
            RecognizerResult(start=0, end=5, score=0.5, entity_type="y",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test',
                                 original_score=0,
                                 pattern_name='test',
                                 pattern='test',
                                 validation_result=None))
        ]
        results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(arr)

        assert len(results) == 2

    def test_added_pattern_recognizer_works(self):
        """A recognizer added to the store is picked up by later analyze() calls."""
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])

        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        text = "rocket is my favorite transportation"
        entities = ["CREDIT_CARD", "ROCKET"]

        results = analyze_engine.analyze(self.unit_test_guid, text=text,
                                         entities=entities, language='en',
                                         all_fields=False)

        assert len(results) == 0

        # Add a new recognizer for the word "rocket"
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)

        # Check that the entity is recognized:
        results = analyze_engine.analyze(self.unit_test_guid, text=text,
                                         entities=entities, language='en',
                                         all_fields=False)

        assert len(results) == 1
        assert_result(results[0], "ROCKET", 0, 7, 0.8)

    def test_removed_pattern_recognizer_doesnt_work(self):
        """A recognizer removed from the store stops producing results."""
        pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("SPACESHIP",
                                               name="Spaceship recognizer",
                                               patterns=[pattern])

        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        text = "spaceship is my favorite transportation"
        entities = ["CREDIT_CARD", "SPACESHIP"]

        results = analyze_engine.analyze(self.unit_test_guid, text=text,
                                         entities=entities, language='en',
                                         all_fields=False)

        assert len(results) == 0

        # Add a new recognizer for the word "spaceship"
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)

        # Check that the entity is recognized:
        results = analyze_engine.analyze(self.unit_test_guid, text=text,
                                         entities=entities, language='en',
                                         all_fields=False)
        assert len(results) == 1
        assert_result(results[0], "SPACESHIP", 0, 10, 0.8)

        # Remove recognizer
        recognizers_store_api_mock.remove_recognizer("Spaceship recognizer")

        # Test again to see we didn't get any results
        results = analyze_engine.analyze(self.unit_test_guid, text=text,
                                         entities=entities, language='en',
                                         all_fields=False)

        assert len(results) == 0

    def test_apply_with_language_returns_correct_response(self):
        """Apply() with an explicit 'en' language returns a results container."""
        request = AnalyzeRequest()
        request.analyzeTemplate.language = 'en'
        request.analyzeTemplate.resultsScoreThreshold = 0
        new_field = request.analyzeTemplate.fields.add()
        new_field.name = 'CREDIT_CARD'
        new_field.minScore = '0.5'
        request.text = "My credit card number is 4916994465041084"
        response = self.loaded_analyzer_engine.Apply(request, None)

        assert response.analyzeResults is not None

    def test_apply_with_no_language_returns_default(self):
        """Apply() with an empty language still returns a results container."""
        request = AnalyzeRequest()
        request.analyzeTemplate.language = ''
        request.analyzeTemplate.resultsScoreThreshold = 0
        new_field = request.analyzeTemplate.fields.add()
        new_field.name = 'CREDIT_CARD'
        new_field.minScore = '0.5'
        request.text = "My credit card number is 4916994465041084"
        response = self.loaded_analyzer_engine.Apply(request, None)

        assert response.analyzeResults is not None

    def test_when_allFields_is_true_return_all_fields(self):
        """With allFields=True, Apply() returns every entity found in the text."""
        analyze_engine = AnalyzerEngine(registry=MockRecognizerRegistry(),
                                        nlp_engine=MockNlpEngine())
        request = AnalyzeRequest()
        request.analyzeTemplate.allFields = True
        request.analyzeTemplate.resultsScoreThreshold = 0
        request.text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090 " \
                       "Domain: microsoft.com"
        response = analyze_engine.Apply(request, None)

        # Validate the container before iterating it; checking afterwards
        # could never fail because iteration would raise first.
        assert response.analyzeResults is not None
        returned_entities = [
            result.field.name for result in response.analyzeResults
        ]

        assert "CREDIT_CARD" in returned_entities
        assert "PHONE_NUMBER" in returned_entities
        assert "DOMAIN_NAME" in returned_entities

    def test_when_allFields_is_true_full_recognizers_list_return_all_fields(
            self):
        """With allFields=True and the full registry, NLP entities are returned."""
        analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                        nlp_engine=loaded_spacy_nlp_engine)
        request = AnalyzeRequest()
        request.analyzeTemplate.allFields = True
        request.text = "My name is David and I live in Seattle." \
                       "Domain: microsoft.com "
        response = analyze_engine.Apply(request, None)

        # Validate the container before iterating it; checking afterwards
        # could never fail because iteration would raise first.
        assert response.analyzeResults is not None
        returned_entities = [
            result.field.name for result in response.analyzeResults
        ]

        assert "PERSON" in returned_entities
        assert "LOCATION" in returned_entities
        assert "DOMAIN_NAME" in returned_entities

    def test_when_allFields_is_true_and_entities_not_empty_exception(self):
        """allFields=True combined with an explicit field raises ValueError."""
        analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                        nlp_engine=MockNlpEngine())
        request = AnalyzeRequest()
        request.text = "My name is David and I live in Seattle." \
                       "Domain: microsoft.com "
        request.analyzeTemplate.allFields = True
        new_field = request.analyzeTemplate.fields.add()
        new_field.name = 'CREDIT_CARD'
        new_field.minScore = '0.5'
        with pytest.raises(ValueError):
            analyze_engine.Apply(request, None)

    def test_when_analyze_then_apptracer_has_value(self):
        """analyze(trace=True) attaches explanations and writes to the tracer."""
        text = "My name is Bart Simpson, and Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER", "PERSON"]
        # No nlp_engine is passed, so AnalyzerEngine falls back to its default.
        analyzer_engine_with_spacy = AnalyzerEngine(self.loaded_registry,
                                                    app_tracer=self.app_tracer,
                                                    enable_trace_pii=True)
        results = analyzer_engine_with_spacy.analyze(
            correlation_id=self.unit_test_guid,
            text=text,
            entities=entities,
            language=language,
            all_fields=False,
            trace=True)

        assert len(results) == 3
        for result in results:
            assert result.analysis_explanation is not None
        assert self.app_tracer.get_msg_counter() == 2
        assert self.app_tracer.get_last_trace() is not None

    def test_when_threshold_is_zero_all_results_pass(self):
        """score_threshold=0 lets both the credit card and the phone through."""
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # A dedicated engine for this test, backed by MockNlpEngine rather
        # than the spaCy engine.
        analyzer_engine = AnalyzerEngine(registry=self.loaded_registry,
                                         nlp_engine=MockNlpEngine())
        results = analyzer_engine.analyze(self.unit_test_guid, text,
                                          entities, language,
                                          all_fields=False,
                                          score_threshold=0)

        assert len(results) == 2

    def test_when_threshold_is_more_than_half_only_credit_card_passes(self):
        """score_threshold=0.51 leaves a single result."""
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # A dedicated engine for this test, backed by MockNlpEngine rather
        # than the spaCy engine.
        analyzer_engine = AnalyzerEngine(registry=self.loaded_registry,
                                         nlp_engine=MockNlpEngine())
        results = analyzer_engine.analyze(self.unit_test_guid, text,
                                          entities, language,
                                          all_fields=False,
                                          score_threshold=0.51)

        assert len(results) == 1

    def test_when_default_threshold_is_more_than_half_only_one_passes(self):
        """An engine-level default_score_threshold=0.7 leaves a single result."""
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # A dedicated engine for this test, backed by MockNlpEngine rather
        # than the spaCy engine, with the threshold set at construction.
        analyzer_engine = AnalyzerEngine(registry=self.loaded_registry,
                                         nlp_engine=MockNlpEngine(),
                                         default_score_threshold=0.7)
        results = analyzer_engine.analyze(self.unit_test_guid, text,
                                          entities, language,
                                          all_fields=False)

        assert len(results) == 1

    def test_when_default_threshold_is_zero_all_results_pass(self):
        """Without any explicit threshold both results are returned."""
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # A dedicated engine for this test, backed by MockNlpEngine rather
        # than the spaCy engine. No threshold is passed, so the constructor's
        # own default applies (presumably 0 - confirm in AnalyzerEngine).
        analyzer_engine = AnalyzerEngine(registry=self.loaded_registry,
                                         nlp_engine=MockNlpEngine())
        results = analyzer_engine.analyze(self.unit_test_guid, text,
                                          entities, language,
                                          all_fields=False)

        assert len(results) == 2

    def test_demo_text(self):
        """The demo text analysed with all_fields=True yields 15 results."""
        # NOTE(review): the literal below is reproduced as found. The two
        # "[email protected]" tokens look like e-mail addresses replaced by
        # an obfuscation placeholder, and "September 18 n" / "192.168.0.1 i\n"
        # look like typos - confirm against the canonical demo text, since
        # each affects the expected result count.
        text = "Here are a few examples of entities we currently support: \n" \
               "Credit card: 4095-2609-9393-4932 \n" \
               "Crypto wallet id: 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ \n" \
               "DateTime: September 18 n" \
               "Domain: microsoft.com \n" \
               "Email address: [email protected] \n" \
               "IBAN code: IL150120690000003111111 \n" \
               "IP: 192.168.0.1 i\n" \
               "Person name: David Johnson\n" \
               "Bank account: 2854567876542\n" \
               "Driver license number: H12234567\n" \
               "Passport: 912803456\n" \
               "Phone number: (212) 555-1234.\n" \
               "Social security number: 078-05-1120\n" \
               "" \
               "This project welcomes contributions and suggestions. Most contributions require you to agree to a " \
               "Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us " \
               "the rights to use your contribution. For details, visit https://cla.microsoft.com.\n" \
               "When you submit a pull request, a CLA-bot will automatically determine whether you need to provide " \
               "a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions " \
               "provided by the bot. You will only need to do this once across all repos using our CLA.\n\n" \
               "This project has adopted the Microsoft Open Source Code of Conduct. For more information see the " \
               "Code of Conduct FAQ or contact [email protected] with any additional questions or comments."

        language = "en"

        # Default registry and NLP engine; only the score threshold is set.
        analyzer_engine = AnalyzerEngine(default_score_threshold=0.6)
        results = analyzer_engine.analyze(correlation_id=self.unit_test_guid,
                                          text=text, entities=None,
                                          language=language, all_fields=True)

        assert len(results) == 15
class TestAnalyzerEngine(TestCase):
    """Tests for AnalyzerEngine: analyze(), Apply() and duplicate removal."""

    def setUp(self):
        """Build the registry and engine shared by the tests in this class.

        Fixtures live in setUp rather than in an overridden __init__: the
        test runner creates one instance per test method either way, and
        this avoids super(TestAnalyzerEngine, self), which looks the class
        up by a module-level name that can be rebound.
        """
        self.loaded_registry = MockRecognizerRegistry(RecognizerStoreApiMock())
        mock_nlp_artifacts = NlpArtifacts([], [], [], [], None, "en")
        self.loaded_analyzer_engine = AnalyzerEngine(
            self.loaded_registry,
            MockNlpEngine(stopwords=[], punct_words=[],
                          nlp_artifacts=mock_nlp_artifacts))

    def test_analyze_with_predefined_recognizers_return_results(self):
        """Requesting only CREDIT_CARD yields exactly the credit card match."""
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD"]
        results = self.loaded_analyzer_engine.analyze(text, entities,
                                                      language,
                                                      all_fields=False)

        assert len(results) == 1
        assert_result(results[0], "CREDIT_CARD", 14, 33,
                      EntityRecognizer.MAX_SCORE)

    def test_analyze_with_multiple_predefined_recognizers(self):
        """Credit card and phone number are both found, in text order."""
        # NOTE(review): in the literal below the phone number spans 47-58,
        # while the assertion expects 48-59. The text may have lost a space
        # somewhere before the number - confirm against the canonical test.
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # This analyzer engine is different from the shared one: no
        # nlp_engine is passed, so AnalyzerEngine's default is used
        # (presumably the spaCy engine, which the context-boosted phone
        # score below relies on - confirm in AnalyzerEngine).
        analyzer_engine_with_spacy = AnalyzerEngine(self.loaded_registry)
        results = analyzer_engine_with_spacy.analyze(text, entities,
                                                     language,
                                                     all_fields=False)

        assert len(results) == 2
        assert_result(results[0], "CREDIT_CARD", 14, 33,
                      EntityRecognizer.MAX_SCORE)
        expected_score = UsPhoneRecognizer.MEDIUM_REGEX_SCORE + \
            PatternRecognizer.CONTEXT_SIMILARITY_FACTOR  # 0.5 + 0.35 = 0.85
        assert_result(results[1], "PHONE_NUMBER", 48, 59, expected_score)

    def test_analyze_without_entities(self):
        """An empty entities list with all_fields=False raises ValueError."""
        with pytest.raises(ValueError):
            language = "en"
            text = " Credit card: 4095-2609-9393-4932, my name is John Oliver, DateTime: September 18 Domain: microsoft.com"
            entities = []
            self.loaded_analyzer_engine.analyze(text, entities, language,
                                                all_fields=False)

    def test_analyze_with_empty_text(self):
        """Empty text produces no results."""
        language = "en"
        text = ""
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]
        results = self.loaded_analyzer_engine.analyze(text, entities,
                                                      language,
                                                      all_fields=False)

        assert len(results) == 0

    def test_analyze_with_unsupported_language(self):
        """A language the registry cannot serve ("de") raises ValueError."""
        with pytest.raises(ValueError):
            language = "de"
            text = ""
            entities = ["CREDIT_CARD", "PHONE_NUMBER"]
            self.loaded_analyzer_engine.analyze(text, entities, language,
                                                all_fields=False)

    def test_remove_duplicates(self):
        """Same span and entity type with different scores keeps the highest."""
        arr = [
            RecognizerResult(start=0, end=5, score=0.1, entity_type="x"),
            RecognizerResult(start=0, end=5, score=0.5, entity_type="x")
        ]
        # The method is private (double underscore), hence the mangled name.
        results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(arr)

        assert len(results) == 1
        assert results[0].score == 0.5
        # TODO: add more cases with bug:
        # bug# 597: Analyzer remove duplicates doesn't handle all cases of
        # one result as a substring of the other

    def test_added_pattern_recognizer_works(self):
        """A recognizer added to the store is picked up by later analyze() calls."""
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])

        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        analyze_engine = AnalyzerEngine(
            MockRecognizerRegistry(recognizers_store_api_mock))
        text = "rocket is my favorite transportation"
        entities = ["CREDIT_CARD", "ROCKET"]

        results = analyze_engine.analyze(text=text, entities=entities,
                                         language='en', all_fields=False)

        assert len(results) == 0

        # Add a new recognizer for the word "rocket"
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)

        # Check that the entity is recognized:
        results = analyze_engine.analyze(text=text, entities=entities,
                                         language='en', all_fields=False)

        assert len(results) == 1
        assert_result(results[0], "ROCKET", 0, 7, 0.8)

    def test_removed_pattern_recognizer_doesnt_work(self):
        """A recognizer removed from the store stops producing results."""
        pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("SPACESHIP",
                                               name="Spaceship recognizer",
                                               patterns=[pattern])

        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        analyze_engine = AnalyzerEngine(
            MockRecognizerRegistry(recognizers_store_api_mock))
        text = "spaceship is my favorite transportation"
        entities = ["CREDIT_CARD", "SPACESHIP"]

        results = analyze_engine.analyze(text=text, entities=entities,
                                         language='en', all_fields=False)

        assert len(results) == 0

        # Add a new recognizer for the word "spaceship"
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)

        # Check that the entity is recognized:
        results = analyze_engine.analyze(text=text, entities=entities,
                                         language='en', all_fields=False)
        assert len(results) == 1
        assert_result(results[0], "SPACESHIP", 0, 10, 0.8)

        # Remove recognizer
        recognizers_store_api_mock.remove_recognizer("Spaceship recognizer")

        # Test again to see we didn't get any results
        results = analyze_engine.analyze(text=text, entities=entities,
                                         language='en', all_fields=False)

        assert len(results) == 0

    def test_apply_with_language_returns_correct_response(self):
        """Apply() with an explicit 'en' language returns a results container."""
        request = AnalyzeRequest()
        request.analyzeTemplate.language = 'en'
        new_field = request.analyzeTemplate.fields.add()
        new_field.name = 'CREDIT_CARD'
        new_field.minScore = '0.5'
        request.text = "My credit card number is 4916994465041084"
        response = self.loaded_analyzer_engine.Apply(request, None)

        assert response.analyzeResults is not None

    def test_apply_with_no_language_returns_default(self):
        """Apply() with an empty language still returns a results container."""
        request = AnalyzeRequest()
        request.analyzeTemplate.language = ''
        new_field = request.analyzeTemplate.fields.add()
        new_field.name = 'CREDIT_CARD'
        new_field.minScore = '0.5'
        request.text = "My credit card number is 4916994465041084"
        response = self.loaded_analyzer_engine.Apply(request, None)

        assert response.analyzeResults is not None

    def test_when_allFields_is_true_return_all_fields(self):
        """With allFields=True, Apply() returns every entity found in the text."""
        analyze_engine = AnalyzerEngine(MockRecognizerRegistry())
        request = AnalyzeRequest()
        request.analyzeTemplate.allFields = True
        request.text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090 " \
                       "Domain: microsoft.com"
        response = analyze_engine.Apply(request, None)

        # Validate the container before iterating it; checking afterwards
        # could never fail because iteration would raise first.
        assert response.analyzeResults is not None
        returned_entities = [
            result.field.name for result in response.analyzeResults
        ]

        assert "CREDIT_CARD" in returned_entities
        assert "PHONE_NUMBER" in returned_entities
        assert "DOMAIN_NAME" in returned_entities

    def test_when_allFields_is_true_full_recognizers_list_return_all_fields(
            self):
        """With allFields=True and the full registry, NLP entities are returned."""
        analyze_engine = AnalyzerEngine(RecognizerRegistry())
        request = AnalyzeRequest()
        request.analyzeTemplate.allFields = True
        request.text = "My name is David and I live in Seattle." \
                       "Domain: microsoft.com "
        response = analyze_engine.Apply(request, None)

        # Validate the container before iterating it; checking afterwards
        # could never fail because iteration would raise first.
        assert response.analyzeResults is not None
        returned_entities = [
            result.field.name for result in response.analyzeResults
        ]

        assert "PERSON" in returned_entities
        assert "LOCATION" in returned_entities
        assert "DOMAIN_NAME" in returned_entities

    def test_when_allFields_is_true_and_entities_not_empty_exception(self):
        """allFields=True combined with an explicit field raises ValueError."""
        analyze_engine = AnalyzerEngine(registry=RecognizerRegistry())
        request = AnalyzeRequest()
        request.text = "My name is David and I live in Seattle." \
                       "Domain: microsoft.com "
        request.analyzeTemplate.allFields = True
        new_field = request.analyzeTemplate.fields.add()
        new_field.name = 'CREDIT_CARD'
        new_field.minScore = '0.5'
        with pytest.raises(ValueError):
            analyze_engine.Apply(request, None)