def test_get_recognizers_returns_added_custom():
    """A custom recognizer added to the store shows up in GetAllRecognizers."""
    rocket_pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    rocket_recognizer = PatternRecognizer(
        "ROCKET", name="Rocket recognizer", patterns=[rocket_pattern]
    )
    store_mock = RecognizerStoreApiMock()
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_mock),
        nlp_engine=NlpEngineMock(),
    )
    request = RecognizersAllRequest(language="en")

    # there are 14 predefined recognizers
    assert len(engine.GetAllRecognizers(request, None)) == 14

    store_mock.add_custom_pattern_recognizer(rocket_recognizer)

    # there are 14 predefined recognizers and one custom
    assert len(engine.GetAllRecognizers(request, None)) == 15
def test_when_analyze_with_multiple_predefined_recognizers_then_succeed(
        loaded_registry, unit_test_guid, nlp_engine, max_score):
    """Credit card and phone number are both detected in the same text."""
    text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
    engine = AnalyzerEngine(registry=loaded_registry, nlp_engine=nlp_engine)

    results = engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=["CREDIT_CARD", "PHONE_NUMBER"],
        language="en",
    )

    assert len(results) == 2
    medium_regex_score = 0.5  # see UsPhoneRecognizer.PATTERNS
    context_similarity_factor = 0.35  # PatternRecognizer.CONTEXT_SIMILARITY_FACTOR
    assert_result(results[0], "CREDIT_CARD", 14, 33, max_score)
    assert_result(
        results[1],
        "PHONE_NUMBER",
        48,
        59,
        medium_regex_score + context_similarity_factor,
    )
def test_when_threshold_is_zero_all_results_pass(loaded_registry,
                                                 unit_test_guid):
    """With score_threshold=0 every candidate result is returned."""
    text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
    # This analyzer engine is different from the global one, as this one
    # also loads SpaCy so it can detect the phone number entity
    engine = AnalyzerEngine(registry=loaded_registry,
                            nlp_engine=NlpEngineMock())

    results = engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=["CREDIT_CARD", "PHONE_NUMBER"],
        language="en",
        all_fields=False,
        score_threshold=0,
    )

    assert len(results) == 2
def test_added_pattern_recognizer_works(unit_test_guid):
    """A recognizer added to the store is picked up on the next analyze call."""
    rocket_pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    rocket_recognizer = PatternRecognizer(
        "ROCKET", name="Rocket recognizer", patterns=[rocket_pattern]
    )

    # Make sure the analyzer doesn't get this entity yet
    store_mock = RecognizerStoreApiMock()
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_mock),
        nlp_engine=NlpEngineMock(),
    )
    text = "rocket is my favorite transportation"
    entities = ["CREDIT_CARD", "ROCKET"]

    assert not engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=entities,
        language="en",
        all_fields=False,
    )

    # Add a new recognizer for the word "rocket" (case insensitive)
    store_mock.add_custom_pattern_recognizer(rocket_recognizer)

    # Check that the entity is recognized:
    results = engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=entities,
        language="en",
        all_fields=False,
    )
    assert len(results) == 1
    assert_result(results[0], "ROCKET", 0, 7, 0.8)
def test_analyze_with_multiple_predefined_recognizers(self):
    """Both the credit card and the phone number recognizers fire."""
    text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
    # This analyzer engine is different from the global one, as this one
    # also loads SpaCy so it can use the context words
    spacy_engine = AnalyzerEngine(
        registry=self.loaded_registry,
        nlp_engine=loaded_spacy_nlp_engine)

    results = spacy_engine.analyze(self.unit_test_guid,
                                   text,
                                   ["CREDIT_CARD", "PHONE_NUMBER"],
                                   "en",
                                   all_fields=False)

    assert len(results) == 2
    assert_result(results[0], "CREDIT_CARD", 14, 33,
                  EntityRecognizer.MAX_SCORE)
    # 0.5 + 0.35 = 0.85
    expected_score = (UsPhoneRecognizer.MEDIUM_REGEX_SCORE +
                      PatternRecognizer.CONTEXT_SIMILARITY_FACTOR)
    assert_result(results[1], "PHONE_NUMBER", 48, 59, expected_score)
def test_get_recognizers_returns_custom(self):
    """GetAllRecognizers includes a custom recognizer stored up front."""
    rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
    rocket_recognizer = PatternRecognizer("ROCKET",
                                          name="Rocket recognizer",
                                          patterns=[rocket_pattern])
    store_mock = RecognizerStoreApiMock()
    store_mock.add_custom_pattern_recognizer(rocket_recognizer)
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_mock),
        nlp_engine=MockNlpEngine())

    response = engine.GetAllRecognizers(
        RecognizersAllRequest(language="en"), None)

    # there are 15 predefined recognizers and one custom
    assert len(response) == 16
    matches = [
        rec for rec in response
        if rec.name == "Rocket recognizer"
        and rec.entities == ["ROCKET"]
        and rec.language == "en"
    ]
    assert len(matches) == 1
def test_removed_pattern_recognizer_doesnt_work(self):
    """A custom recognizer stops matching once removed from the store."""
    spaceship_pattern = Pattern("spaceship pattern",
                                r'\W*(spaceship)\W*', 0.8)
    spaceship_recognizer = PatternRecognizer("SPACESHIP",
                                             name="Spaceship recognizer",
                                             patterns=[spaceship_pattern])

    # Make sure the analyzer doesn't get this entity yet
    store_mock = RecognizerStoreApiMock()
    engine = AnalyzerEngine(registry=MockRecognizerRegistry(
        store_mock), nlp_engine=MockNlpEngine())
    text = "spaceship is my favorite transportation"
    entities = ["CREDIT_CARD", "SPACESHIP"]

    assert not engine.analyze(self.unit_test_guid,
                              text=text,
                              entities=entities,
                              language='en',
                              all_fields=False)

    # Add a new recognizer for the word "spaceship" (case insensitive)
    store_mock.add_custom_pattern_recognizer(spaceship_recognizer)

    # Now the entity should be recognized:
    results = engine.analyze(self.unit_test_guid,
                             text=text,
                             entities=entities,
                             language='en',
                             all_fields=False)
    assert len(results) == 1
    assert_result(results[0], "SPACESHIP", 0, 10, 0.8)

    # Remove the recognizer and verify nothing is detected anymore
    store_mock.remove_recognizer("Spaceship recognizer")
    assert not engine.analyze(self.unit_test_guid,
                              text=text,
                              entities=entities,
                              language='en',
                              all_fields=False)
def test_when_analyze_added_pattern_recognizer_then_succeed(unit_test_guid):
    """A recognizer registered after engine creation is used by analyze()."""
    rocket_pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    rocket_recognizer = PatternRecognizer(
        "ROCKET", name="Rocket recognizer", patterns=[rocket_pattern]
    )
    registry_mock = RecognizerRegistryMock()

    # Make sure the analyzer doesn't get this entity yet
    engine = AnalyzerEngine(
        registry=registry_mock,
        nlp_engine=NlpEngineMock(),
    )
    text = "rocket is my favorite transportation"
    entities = ["CREDIT_CARD", "ROCKET"]

    assert not engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=entities,
        language="en",
    )

    # Add a new recognizer for the word "rocket" (case insensitive)
    registry_mock.add_recognizer(rocket_recognizer)

    # Check that the entity is recognized:
    results = engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=entities,
        language="en",
    )
    assert len(results) == 1
    assert_result(results[0], "ROCKET", 0, 7, 0.8)
def test_when_analyze_then_apptracer_has_value(loaded_registry, unit_test_guid,
                                               nlp_engine):
    """The decision-process tracer records traces when tracing is enabled."""
    text = "My name is Bart Simpson, and Credit card: 4095-2609-9393-4932, my phone is 425 8829090"  # noqa E501
    tracer_mock = AppTracerMock(enable_decision_process=True)
    engine = AnalyzerEngine(
        loaded_registry,
        app_tracer=tracer_mock,
        log_decision_process=True,
        nlp_engine=nlp_engine,
    )

    results = engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=["CREDIT_CARD", "PHONE_NUMBER", "PERSON"],
        language="en",
        return_decision_process=True,
    )

    assert len(results) == 3
    assert all(result.analysis_explanation is not None for result in results)
    assert tracer_mock.get_msg_counter() == 2
    assert tracer_mock.get_last_trace() is not None
class PDM:
    """Detects and anonymizes PII in free text using Presidio engines."""

    def __init__(self, language='en'):
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()
        self.language = language

    def predict(self, text, entities_of_interest=ENTITIES_OF_INTEREST):
        """Analyze and anonymize *text*, returning timings and detections."""
        t0 = time()
        analyzer_results = self.analyzer.analyze(
            text, entities=entities_of_interest, language=self.language)
        t1 = time()
        anonymized_results = self.anonymizer.anonymize(
            text=text, analyzer_results=analyzer_results)
        t2 = time()

        detected = [
            {'start': item.start,
             'end': item.end,
             'entity_type': item.entity_type}
            for item in anonymized_results.items
        ]
        return {
            'time_to_analyze': f'{t1-t0:.4f} seconds',
            'time_to_anonymize': f'{t2-t1:.4f} seconds',
            'anonymized_text': anonymized_results.text,
            'detected_items': detected,
        }
def test_remove_duplicates_different_entity_no_removal(self):
    # Same span and different entity types: deduplication must keep both.
    first = RecognizerResult(
        start=0, end=5, score=0.1, entity_type="x",
        analysis_explanation=AnalysisExplanation(
            recognizer='test', original_score=0, pattern_name='test',
            pattern='test', validation_result=None))
    second = RecognizerResult(
        start=0, end=5, score=0.5, entity_type="y",
        analysis_explanation=AnalysisExplanation(
            recognizer='test', original_score=0, pattern_name='test',
            pattern='test', validation_result=None))

    deduped = AnalyzerEngine._AnalyzerEngine__remove_duplicates(
        [first, second])
    assert len(deduped) == 2
def __init__(
    self,
    analyzer_engine=None,
    entities_to_keep: List[str] = None,
    verbose: bool = False,
    labeling_scheme="BIO",
    score_threshold=0.4,
):
    """
    Evaluation wrapper for the Presidio Analyzer

    :param analyzer_engine: object of type AnalyzerEngine
        (from presidio-analyzer). When omitted, a fresh AnalyzerEngine is
        created for this instance.
    :param entities_to_keep: restrict evaluation to these entity types
    :param verbose: emit extra logging while evaluating
    :param labeling_scheme: token labeling scheme (e.g. "BIO")
    :param score_threshold: minimum confidence score a result must reach
    """
    super().__init__(
        entities_to_keep=entities_to_keep,
        verbose=verbose,
        labeling_scheme=labeling_scheme,
    )
    # BUG FIX: the original default `analyzer_engine=AnalyzerEngine()` was
    # evaluated once at import time, making import slow and sharing one
    # mutable engine across every instance. Use the None-sentinel idiom
    # instead; callers that passed an engine explicitly are unaffected.
    if analyzer_engine is None:
        analyzer_engine = AnalyzerEngine()
    self.analyzer_engine = analyzer_engine
    self.score_threshold = score_threshold
def serve_command_handler(enable_trace_pii,
                          env_grpc_port=False,
                          grpc_port=3000):
    """Start the analyzer GRPC server and block until interrupted.

    :param enable_trace_pii: forwarded to AnalyzerEngine to enable PII tracing
    :param env_grpc_port: when truthy, read the port from the GRPC_PORT
        environment variable instead of using ``grpc_port``
    :param grpc_port: fallback listening port (default 3000)
    """
    logger.info("Starting GRPC server")
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    logger.info("GRPC started")

    logger.info("Creating RecognizerRegistry")
    registry = RecognizerRegistry()
    logger.info("RecognizerRegistry created")
    logger.info("Creating SpacyNlpEngine")
    nlp_engine = SpacyNlpEngine()
    logger.info("SpacyNlpEngine created")

    analyze_pb2_grpc.add_AnalyzeServiceServicer_to_server(
        AnalyzerEngine(registry=registry,
                       nlp_engine=nlp_engine,
                       enable_trace_pii=enable_trace_pii,
                       use_recognizer_store=True),
        server)
    logger.info("Added AnalyzeServiceServicer to server")

    if env_grpc_port:
        logger.info("Getting port {}".format(env_grpc_port))
        port = os.environ.get('GRPC_PORT')
        # BUG FIX: the original condition was
        # `if port is not None or port != ''`, which is always true and
        # crashed with int(None) when GRPC_PORT was unset. Both conditions
        # must hold before converting.
        if port is not None and port != '':
            grpc_port = int(port)
    else:
        logger.info("env_grpc_port not provided. "
                    "Using grpc_port {}".format(grpc_port))

    server.add_insecure_port('[::]:' + str(grpc_port))
    logger.info("Starting GRPC listener at port {}".format(grpc_port))
    server.start()
    try:
        # Keep the main thread alive; grpc serves from its worker threads.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        server.stop(0)
else: emails_dict = {} if os.path.isfile(names_dict_pkl_path): names_dict = pickle.load( open(names_dict_pkl_path, "rb") ) else: names_dict = {} # Define locale and language dictionaries faker_locales_dict = {'UNITED STATES': 'en_US', 'ITALY': 'it_IT', 'GERMANY': 'de_DE'} # Initialize Presidio's analyzer and anonymizer # https://microsoft.github.io/presidio/supported_entities/ analyzer = AnalyzerEngine() anonymizer = AnonymizerEngine() # Create a copy of the source dataset df = dataset.copy() # Apply the function anonymizeName for each value of the Name column df.Name = pd.Series( [anonymizeName(text, country) for (text, country) in zip(df['Name'], df['Country'])] ) # Apply the function anonymizeEmail for each value of the Email column df.Email = pd.Series( [anonymizeEmail(text, country) for (text, country) in zip(df['Email'], df['Country'])] ) # Column Notes is 'object' data type as it contains lot of NaN and # Pandas doesn't recognize it as string. So it has to be cast to string # in order to be anonymized. Then replace it with its anonymization df.Notes = pd.Series( [anonymizeName(text, country) for (text, country) in zip(df['Notes'].astype('str'), df['Country'])] )
class TestAnalyzerEngine(TestCase):
    """Unit tests for AnalyzerEngine using a mocked registry and NLP engine."""

    def __init__(self, *args, **kwargs):
        super(TestAnalyzerEngine, self).__init__(*args, **kwargs)
        # Registry backed by an in-memory recognizer store mock.
        self.loaded_registry = MockRecognizerRegistry(RecognizerStoreApiMock())
        mock_nlp_artifacts = NlpArtifacts([], [], [], [], None, "en")
        self.app_tracer = AppTracerMock(enable_interpretability=True)
        # Shared engine with a mocked NLP pipeline; tests that need real
        # SpaCy context build their own engine instead.
        self.loaded_analyzer_engine = AnalyzerEngine(
            self.loaded_registry,
            MockNlpEngine(stopwords=[],
                          punct_words=[],
                          nlp_artifacts=mock_nlp_artifacts),
            app_tracer=self.app_tracer,
            enable_trace_pii=True)
        self.unit_test_guid = "00000000-0000-0000-0000-000000000000"

    def test_analyze_with_predefined_recognizers_return_results(self):
        # A single requested entity yields a single matching result.
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD"]
        results = self.loaded_analyzer_engine.analyze(self.unit_test_guid,
                                                      text, entities,
                                                      language,
                                                      all_fields=False)

        assert len(results) == 1
        assert_result(results[0], "CREDIT_CARD", 14, 33,
                      EntityRecognizer.MAX_SCORE)

    def test_analyze_with_multiple_predefined_recognizers(self):
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # This analyzer engine is different from the global one, as this one
        # also loads SpaCy so it can use the context words
        analyzer_engine_with_spacy = AnalyzerEngine(
            registry=self.loaded_registry,
            nlp_engine=loaded_spacy_nlp_engine)
        results = analyzer_engine_with_spacy.analyze(self.unit_test_guid,
                                                     text, entities,
                                                     language,
                                                     all_fields=False)

        assert len(results) == 2
        assert_result(results[0], "CREDIT_CARD", 14, 33,
                      EntityRecognizer.MAX_SCORE)
        expected_score = UsPhoneRecognizer.MEDIUM_REGEX_SCORE + \
            PatternRecognizer.CONTEXT_SIMILARITY_FACTOR  # 0.5 + 0.35 = 0.85
        assert_result(results[1], "PHONE_NUMBER", 48, 59, expected_score)

    def test_analyze_without_entities(self):
        # An empty entities list is invalid input when all_fields is False.
        with pytest.raises(ValueError):
            language = "en"
            text = " Credit card: 4095-2609-9393-4932, my name is John Oliver, DateTime: September 18 Domain: microsoft.com"
            entities = []
            self.loaded_analyzer_engine.analyze(self.unit_test_guid, text,
                                                entities, language,
                                                all_fields=False)

    def test_analyze_with_empty_text(self):
        # Empty input text yields no results rather than an error.
        language = "en"
        text = ""
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]
        results = self.loaded_analyzer_engine.analyze(self.unit_test_guid,
                                                      text, entities,
                                                      language,
                                                      all_fields=False)

        assert len(results) == 0

    def test_analyze_with_unsupported_language(self):
        # Only "en" is configured; other language codes must raise.
        with pytest.raises(ValueError):
            language = "de"
            text = ""
            entities = ["CREDIT_CARD", "PHONE_NUMBER"]
            self.loaded_analyzer_engine.analyze(self.unit_test_guid, text,
                                                entities, language,
                                                all_fields=False)

    def test_remove_duplicates(self):
        # test same result with different score will return only the highest
        arr = [
            RecognizerResult(start=0, end=5, score=0.1, entity_type="x",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test', original_score=0,
                                 pattern_name='test', pattern='test',
                                 validation_result=None)),
            RecognizerResult(start=0, end=5, score=0.5, entity_type="x",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test', original_score=0,
                                 pattern_name='test', pattern='test',
                                 validation_result=None))
        ]
        results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(arr)
        assert len(results) == 1
        assert results[0].score == 0.5
        # TODO: add more cases with bug:
        # bug# 597: Analyzer remove duplicates doesn't handle all cases of
        # one result as a substring of the other

    def test_remove_duplicates_different_entity_no_removal(self):
        # Same span but different entity types: both results must be kept.
        arr = [
            RecognizerResult(start=0, end=5, score=0.1, entity_type="x",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test', original_score=0,
                                 pattern_name='test', pattern='test',
                                 validation_result=None)),
            RecognizerResult(start=0, end=5, score=0.5, entity_type="y",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test', original_score=0,
                                 pattern_name='test', pattern='test',
                                 validation_result=None))
        ]
        results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(arr)
        assert len(results) == 2

    def test_added_pattern_recognizer_works(self):
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])

        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        text = "rocket is my favorite transportation"
        entities = ["CREDIT_CARD", "ROCKET"]
        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)

        assert len(results) == 0

        # Add a new recognizer for the word "rocket" (case insensitive)
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)

        # Check that the entity is recognized:
        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)
        assert len(results) == 1
        assert_result(results[0], "ROCKET", 0, 7, 0.8)

    def test_removed_pattern_recognizer_doesnt_work(self):
        pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("SPACESHIP",
                                               name="Spaceship recognizer",
                                               patterns=[pattern])
        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        text = "spaceship is my favorite transportation"
        entities = ["CREDIT_CARD", "SPACESHIP"]
        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)

        assert len(results) == 0

        # Add a new recognizer for the word "rocket" (case insensitive)
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)
        # Check that the entity is recognized:
        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)
        assert len(results) == 1
        assert_result(results[0], "SPACESHIP", 0, 10, 0.8)

        # Remove recognizer
        recognizers_store_api_mock.remove_recognizer("Spaceship recognizer")
        # Test again to see we didn't get any results
        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)

        assert len(results) == 0

    def test_apply_with_language_returns_correct_response(self):
        # GRPC-style Apply call with an explicit language.
        request = AnalyzeRequest()
        request.analyzeTemplate.language = 'en'
        request.analyzeTemplate.resultsScoreThreshold = 0
        new_field = request.analyzeTemplate.fields.add()
        new_field.name = 'CREDIT_CARD'
        new_field.minScore = '0.5'
        request.text = "My credit card number is 4916994465041084"
        response = self.loaded_analyzer_engine.Apply(request, None)

        assert response.analyzeResults is not None

    def test_apply_with_no_language_returns_default(self):
        # An empty language falls back to the engine default.
        request = AnalyzeRequest()
        request.analyzeTemplate.language = ''
        request.analyzeTemplate.resultsScoreThreshold = 0
        new_field = request.analyzeTemplate.fields.add()
        new_field.name = 'CREDIT_CARD'
        new_field.minScore = '0.5'
        request.text = "My credit card number is 4916994465041084"
        response = self.loaded_analyzer_engine.Apply(request, None)
        assert response.analyzeResults is not None

    def test_when_allFields_is_true_return_all_fields(self):
        # allFields=True should detect every supported entity type.
        analyze_engine = AnalyzerEngine(registry=MockRecognizerRegistry(),
                                        nlp_engine=MockNlpEngine())
        request = AnalyzeRequest()
        request.analyzeTemplate.allFields = True
        request.analyzeTemplate.resultsScoreThreshold = 0
        request.text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090 " \
                       "Domain: microsoft.com"
        response = analyze_engine.Apply(request, None)
        returned_entities = [
            field.field.name for field in response.analyzeResults
        ]

        assert response.analyzeResults is not None
        assert "CREDIT_CARD" in returned_entities
        assert "PHONE_NUMBER" in returned_entities
        assert "DOMAIN_NAME" in returned_entities

    def test_when_allFields_is_true_full_recognizers_list_return_all_fields(
            self):
        # Same as above but with the full (non-mocked) recognizer registry.
        analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                        nlp_engine=loaded_spacy_nlp_engine)
        request = AnalyzeRequest()
        request.analyzeTemplate.allFields = True
        request.text = "My name is David and I live in Seattle." \
                       "Domain: microsoft.com "
        response = analyze_engine.Apply(request, None)
        returned_entities = [
            field.field.name for field in response.analyzeResults
        ]
        assert response.analyzeResults is not None
        assert "PERSON" in returned_entities
        assert "LOCATION" in returned_entities
        assert "DOMAIN_NAME" in returned_entities

    def test_when_allFields_is_true_and_entities_not_empty_exception(self):
        # allFields=True combined with explicit fields is contradictory input.
        analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                        nlp_engine=MockNlpEngine())
        request = AnalyzeRequest()
        request.text = "My name is David and I live in Seattle." \
                       "Domain: microsoft.com "
        request.analyzeTemplate.allFields = True
        new_field = request.analyzeTemplate.fields.add()
        new_field.name = 'CREDIT_CARD'
        new_field.minScore = '0.5'
        with pytest.raises(ValueError):
            analyze_engine.Apply(request, None)

    def test_when_analyze_then_apptracer_has_value(self):
        # Tracing enabled: the app tracer must record the decision process.
        text = "My name is Bart Simpson, and Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER", "PERSON"]
        analyzer_engine_with_spacy = AnalyzerEngine(
            self.loaded_registry,
            app_tracer=self.app_tracer,
            enable_trace_pii=True,
            nlp_engine=TESTS_NLP_ENGINE)
        results = analyzer_engine_with_spacy.analyze(
            correlation_id=self.unit_test_guid,
            text=text,
            entities=entities,
            language=language,
            all_fields=False,
            trace=True)
        assert len(results) == 3
        for result in results:
            assert result.analysis_explanation is not None
        assert self.app_tracer.get_msg_counter() == 2
        assert self.app_tracer.get_last_trace() is not None

    def test_when_threshold_is_zero_all_results_pass(self):
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # This analyzer engine is different from the global one, as this one
        # also loads SpaCy so it can detect the phone number entity
        analyzer_engine = AnalyzerEngine(registry=self.loaded_registry,
                                         nlp_engine=MockNlpEngine())
        results = analyzer_engine.analyze(self.unit_test_guid, text,
                                          entities, language,
                                          all_fields=False,
                                          score_threshold=0)

        assert len(results) == 2

    def test_when_threshold_is_more_than_half_only_credit_card_passes(self):
        # The phone result (score 0.5) falls below the 0.51 threshold.
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # This analyzer engine is different from the global one, as this one
        # also loads SpaCy so it can detect the phone number entity
        analyzer_engine = AnalyzerEngine(registry=self.loaded_registry,
                                         nlp_engine=MockNlpEngine())
        results = analyzer_engine.analyze(self.unit_test_guid, text,
                                          entities, language,
                                          all_fields=False,
                                          score_threshold=0.51)

        assert len(results) == 1

    def test_when_default_threshold_is_more_than_half_only_one_passes(self):
        # Engine-level default_score_threshold filters like a per-call one.
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # This analyzer engine is different from the global one, as this one
        # also loads SpaCy so it can detect the phone number entity
        analyzer_engine = AnalyzerEngine(registry=self.loaded_registry,
                                         nlp_engine=MockNlpEngine(),
                                         default_score_threshold=0.7)
        results = analyzer_engine.analyze(self.unit_test_guid, text,
                                          entities, language,
                                          all_fields=False)

        assert len(results) == 1

    def test_when_default_threshold_is_zero_all_results_pass(self):
        text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # This analyzer engine is different from the global one, as this one
        # also loads SpaCy so it can detect the phone number entity
        analyzer_engine = AnalyzerEngine(registry=self.loaded_registry,
                                         nlp_engine=MockNlpEngine())
        results = analyzer_engine.analyze(self.unit_test_guid, text,
                                          entities, language,
                                          all_fields=False)

        assert len(results) == 2

    def test_demo_text(self):
        # End-to-end sanity check over the demo text with all fields enabled.
        text = "Here are a few examples sentences we currently support:\n\n" \
               "Hello, my name is David Johnson and I live in Maine.\n" \
               "My credit card number is 4095-2609-9393-4932 and my " \
               "Crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.\n\n" \
               "On September 18 I visited microsoft.com and sent an " \
               "email to [email protected], from the IP 192.168.0.1.\n\n" \
               "My passport: 991280345 and my phone number: (212) 555-1234.\n\n" \
               "Please transfer using this IBAN IL150120690000003111111.\n\n" \
               "Can you please check the status on bank account 954567876544 " \
               "in PresidiBank?\n\n" \
               "" \
               "Kate's social security number is 078-05-1120. " \
               "Her driver license? it is 9234567B.\n\n" \
               "" \
               "This project welcomes contributions and suggestions.\n" \
               "Most contributions require you to agree to a " \
               "Contributor License Agreement (CLA) declaring " \
               "that you have the right to, and actually do, " \
               "grant us the rights to use your contribution. " \
               "For details, visit https://cla.microsoft.com " \
               "When you submit a pull request, " \
               "a CLA-bot will automatically determine whether " \
               "you need to provide a CLA and decorate the PR " \
               "appropriately (e.g., label, comment).\n" \
               "Simply follow the instructions provided by the bot. " \
               "You will only need to do this once across all repos using our CLA.\n" \
               "This project has adopted the Microsoft Open Source Code of Conduct.\n" \
               "For more information see the Code of Conduct FAQ or " \
               "contact [email protected] with any additional questions or comments."
        language = "en"

        analyzer_engine = AnalyzerEngine(default_score_threshold=0.35,
                                         nlp_engine=loaded_spacy_nlp_engine)
        results = analyzer_engine.analyze(correlation_id=self.unit_test_guid,
                                          text=text, entities=None,
                                          language=language, all_fields=True)
        for result in results:
            logger.info(
                "Entity = {}, Text = {}, Score={}, Start={}, End={}".format(
                    result.entity_type, text[result.start:result.end],
                    result.score, result.start, result.end))
        detected_entities = [result.entity_type for result in results]

        assert len([
            entity for entity in detected_entities if entity == "CREDIT_CARD"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "CRYPTO"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "DATE_TIME"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "DOMAIN_NAME"
        ]) == 4
        assert len([
            entity for entity in detected_entities
            if entity == "EMAIL_ADDRESS"
        ]) == 2
        assert len([
            entity for entity in detected_entities if entity == "IBAN_CODE"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "IP_ADDRESS"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "LOCATION"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "PERSON"
        ]) == 2
        assert len([
            entity for entity in detected_entities if entity == "PHONE_NUMBER"
        ]) == 1
        assert len([
            entity for entity in detected_entities
            if entity == "US_BANK_NUMBER"
        ]) == 1
        assert len([
            entity for entity in detected_entities
            if entity == "US_DRIVER_LICENSE"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "US_PASSPORT"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "US_SSN"
        ]) == 1

        assert len(results) == 19

    def test_get_recognizers_returns_predefined(self):
        analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                        nlp_engine=loaded_spacy_nlp_engine)
        request = RecognizersAllRequest(language="en")
        response = analyze_engine.GetAllRecognizers(request, None)
        # there are 15 predefined recognizers that detect the 17 entities
        assert len(response) == 15

    def test_get_recognizers_returns_custom(self):
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)
        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        request = RecognizersAllRequest(language="en")
        response = analyze_engine.GetAllRecognizers(request, None)
        # there are 15 predefined recognizers and one custom
        assert len(response) == 16
        rocket_recognizer = [
            recognizer for recognizer in response
            if recognizer.name == "Rocket recognizer"
            and recognizer.entities == ["ROCKET"]
            and recognizer.language == "en"
        ]
        assert len(rocket_recognizer) == 1

    def test_get_recognizers_returns_added_custom(self):
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])
        recognizers_store_api_mock = RecognizerStoreApiMock()

        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        request = RecognizersAllRequest(language="en")
        response = analyze_engine.GetAllRecognizers(request, None)
        # there are 15 predefined recognizers
        assert len(response) == 15
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)
        response = analyze_engine.GetAllRecognizers(request, None)
        # there are 15 predefined recognizers and one custom
        assert len(response) == 16

    def test_get_recognizers_returns_supported_language(self):
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer(
            "ROCKET",
            name="Rocket recognizer RU",
            patterns=[pattern],
            supported_language="ru")
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)
        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        request = RecognizersAllRequest(language="ru")
        response = analyze_engine.GetAllRecognizers(request, None)
        # there is only 1 mocked russian recognizer
        assert len(response) == 1
class PresidioPIIAnalyzer(BaseAnalyzer):
    """PII analyzer/anonymizer built on Microsoft Presidio.

    Wraps a Presidio ``AnalyzerEngine`` (detection) and ``AnonymizerEngine``
    (redaction), configured from :class:`PresidioEngineConfig`.
    """

    _analyzer: AnalyzerEngine = PrivateAttr()
    _anonymizer: AnonymizerEngine = PrivateAttr()
    TYPE: str = "PresidioPII"
    engine_config: Optional[PresidioEngineConfig] = None
    # To see list of supported entities refer https://microsoft.github.io/presidio/supported_entities/
    # To add customer recognizers refer https://microsoft.github.io/presidio/analyzer/adding_recognizers/
    entity_recognizers: Optional[List[EntityRecognizer]] = None
    # To find more details refer https://microsoft.github.io/presidio/anonymizer/
    anonymizers_config: Optional[Dict[str, OperatorConfig]] = None

    def __init__(self, **data: Any):
        super().__init__(**data)

        if not self.engine_config:
            self.engine_config = PresidioEngineConfig()

        # If spacy engine then load Spacy models and select languages
        languages = []
        for model_config in self.engine_config.models:
            languages.append(model_config.lang_code)

            # Check SpacyNlpEngine.engine_name
            if self.engine_config.nlp_engine_name == "spacy":
                try:
                    spacy_model = __import__(model_config.model_name)
                    spacy_model.load()
                    logger.info(
                        f"Spacy model {model_config.model_name} is already downloaded"
                    )
                # BUGFIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit. Any failure to import or
                # load the model is treated as "not downloaded".
                except Exception:
                    logger.warning(
                        f"Spacy model {model_config.model_name} is not downloaded"
                    )
                    logger.warning(
                        f"Downloading spacy model {model_config.model_name}, it might take some time"
                    )
                    from spacy.cli import download

                    download(model_config.model_name)

        # Create NLP engine based on configuration
        provider = NlpEngineProvider(nlp_configuration=self.engine_config.dict())
        nlp_engine = provider.create_engine()

        # Pass the created NLP engine and supported_languages to the AnalyzerEngine
        self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                                        supported_languages=languages)

        # Register any user-supplied custom recognizers.
        if self.entity_recognizers:
            for entity_recognizer in self.entity_recognizers:
                self._analyzer.registry.add_recognizer(entity_recognizer)

        # Initialize the anonymizer with logger
        self._anonymizer = AnonymizerEngine()

    def analyze_input(
        self,
        source_response_list: List[AnalyzerRequest],
        analyzer_config: PresidioPIIAnalyzerConfig,
        language: Optional[str] = "en",
        **kwargs,
    ) -> List[AnalyzerResponse]:
        """Detect (and optionally anonymize) PII in each source response.

        :param source_response_list: inputs whose ``processed_text`` is analyzed
        :param analyzer_config: entities to detect, anonymizer operators,
            and flags (``analyze_only``, ``replace_original_text``)
        :param language: language code passed to the Presidio analyzer
        :return: one AnalyzerResponse per input, carrying analyzer results and
            (when anonymization ran) the anonymized items/text
        """
        analyzer_output: List[AnalyzerResponse] = []

        for source_response in source_response_list:
            analyzer_result = self._analyzer.analyze(
                text=source_response.processed_text,
                entities=analyzer_config.entities,
                return_decision_process=analyzer_config.return_decision_process,
                language=language,
            )

            anonymized_result = None
            if not analyzer_config.analyze_only:
                # Per-request operators override the instance-level config.
                anonymizers_config = (analyzer_config.anonymizers_config
                                      or self.anonymizers_config)

                if (source_response.processed_text is not None
                        and len(source_response.processed_text) > 0):
                    anonymized_result = self._anonymizer.anonymize(
                        text=source_response.processed_text,
                        operators=anonymizers_config,
                        analyzer_results=analyzer_result,
                    )

            # Optionally replace the original text with the anonymized version.
            if analyzer_config.replace_original_text and anonymized_result is not None:
                text = anonymized_result.text
            else:
                text = source_response.processed_text

            analyzer_output.append(
                AnalyzerResponse(
                    processed_text=text,
                    meta=source_response.meta,
                    segmented_data={
                        "analyzer_result":
                        [vars(result) for result in analyzer_result],
                        "anonymized_result":
                        None if not anonymized_result else
                        [vars(item) for item in anonymized_result.items],
                        "anonymized_text":
                        None if not anonymized_result else anonymized_result.text,
                    },
                    source_name=source_response.source_name,
                ))

        return analyzer_output
def test_demo_text(self): text = "Here are a few examples sentences we currently support:\n\n" \ "Hello, my name is David Johnson and I live in Maine.\n" \ "My credit card number is 4095-2609-9393-4932 and my " \ "Crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.\n\n" \ "On September 18 I visited microsoft.com and sent an " \ "email to [email protected], from the IP 192.168.0.1.\n\n" \ "My passport: 991280345 and my phone number: (212) 555-1234.\n\n" \ "Please transfer using this IBAN IL150120690000003111111.\n\n" \ "Can you please check the status on bank account 954567876544 " \ "in PresidiBank?\n\n" \ "" \ "Kate's social security number is 078-05-1120. " \ "Her driver license? it is 9234567B.\n\n" \ "" \ "This project welcomes contributions and suggestions.\n" \ "Most contributions require you to agree to a " \ "Contributor License Agreement (CLA) declaring " \ "that you have the right to, and actually do, " \ "grant us the rights to use your contribution. " \ "For details, visit https://cla.microsoft.com " \ "When you submit a pull request, " \ "a CLA-bot will automatically determine whether " \ "you need to provide a CLA and decorate the PR " \ "appropriately (e.g., label, comment).\n" \ "Simply follow the instructions provided by the bot. " \ "You will only need to do this once across all repos using our CLA.\n" \ "This project has adopted the Microsoft Open Source Code of Conduct.\n" \ "For more information see the Code of Conduct FAQ or " \ "contact [email protected] with any additional questions or comments." 
language = "en" analyzer_engine = AnalyzerEngine(default_score_threshold=0.35, nlp_engine=loaded_spacy_nlp_engine) results = analyzer_engine.analyze(correlation_id=self.unit_test_guid, text=text, entities=None, language=language, all_fields=True) for result in results: logger.info( "Entity = {}, Text = {}, Score={}, Start={}, End={}".format( result.entity_type, text[result.start:result.end], result.score, result.start, result.end)) detected_entities = [result.entity_type for result in results] assert len([ entity for entity in detected_entities if entity == "CREDIT_CARD" ]) == 1 assert len([ entity for entity in detected_entities if entity == "CRYPTO" ]) == 1 assert len([ entity for entity in detected_entities if entity == "DATE_TIME" ]) == 1 assert len([ entity for entity in detected_entities if entity == "DOMAIN_NAME" ]) == 4 assert len([ entity for entity in detected_entities if entity == "EMAIL_ADDRESS" ]) == 2 assert len([ entity for entity in detected_entities if entity == "IBAN_CODE" ]) == 1 assert len([ entity for entity in detected_entities if entity == "IP_ADDRESS" ]) == 1 assert len([ entity for entity in detected_entities if entity == "LOCATION" ]) == 1 assert len([ entity for entity in detected_entities if entity == "PERSON" ]) == 2 assert len([ entity for entity in detected_entities if entity == "PHONE_NUMBER" ]) == 1 assert len([ entity for entity in detected_entities if entity == "US_BANK_NUMBER" ]) == 1 assert len([ entity for entity in detected_entities if entity == "US_DRIVER_LICENSE" ]) == 1 assert len([ entity for entity in detected_entities if entity == "US_PASSPORT" ]) == 1 assert len([ entity for entity in detected_entities if entity == "US_SSN" ]) == 1 assert len(results) == 19
def test_when_demo_text_then_return_results(unit_test_guid, nlp_engine): dir_path = Path(__file__).resolve().parent with open(Path(dir_path, "data", "demo.txt"), encoding="utf-8") as f: text_into_rows = f.read().split("\n") text_into_rows = [txt.strip() for txt in text_into_rows] text = " ".join(text_into_rows) language = "en" analyzer_engine = AnalyzerEngine(default_score_threshold=0.35, nlp_engine=nlp_engine) results = analyzer_engine.analyze( correlation_id=unit_test_guid, text=text, entities=None, language=language, ) def replace_with_entity_name(original_text: str, responses: List[RecognizerResult]): """ Performs replacements for every entity with its entity type """ delta = 0 new_text = original_text responses = sorted(responses, key=lambda x: x.start) for i, resp in enumerate(responses): # check if this response is already contained in a previous one if len([prev for prev in responses[:i] if resp.contained_in(prev)]) > 0: continue start = resp.start + delta end = resp.end + delta entity_text = original_text[resp.start:resp.end] entity_type = resp.entity_type new_text = f"{new_text[:start]}<{entity_type}>{new_text[end:]}" delta += len(entity_type) + 2 - len(entity_text) return new_text actual_anonymized_text = replace_with_entity_name(text, results) for result in results: text_slice = slice(result.start, result.end) print("Entity = {}, Text = {}, Score={}, Start={}, End={}".format( result.entity_type, text[text_slice], result.score, result.start, result.end, )) with open(Path(dir_path, "data", "demo_anonymized.txt"), encoding="utf-8") as f_exp: text_into_rows = f_exp.read().split("\n") text_into_rows = [txt.strip() for txt in text_into_rows] expected_anonymized_text = " ".join(text_into_rows) # assert len(results) == 19 assert expected_anonymized_text == actual_anonymized_text
from presidio_analyzer import AnalyzerEngine, PatternRecognizer from presidio_anonymizer import AnonymizerEngine from presidio_anonymizer.entities.engine import OperatorConfig text_to_anonymize = "His name is Tom and his phone number is 212-555-5555" analyzer = AnalyzerEngine() anonymizer = AnonymizerEngine() analyzer_results = analyzer.analyze(text=text_to_anonymize, language='en') print("\nPII Detection:") print(analyzer_results) anonymized_results = anonymizer.anonymize( text=text_to_anonymize, analyzer_results=analyzer_results, operators={ "DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"}) }) print("\nPII Anonymization:") print(anonymized_results.to_json())
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern analyzer = AnalyzerEngine() text1 = "Professor Plum, in the Dining Room, with the candlestick" titles_list = [ "Sir", "Ma'am", "Madam", "Mr.", "Mrs.", "Ms.", "Miss", "Dr.", "Professor" ] titles_recognizer = PatternRecognizer(supported_entity="TITLE", deny_list=titles_list) analyzer.registry.add_recognizer(titles_recognizer) result = analyzer.analyze(text=text1, language='en') print(f"\nDeny List result:\n {result}") text2 = "I live in 510 Broad st." numbers_pattern = Pattern(name="numbers_pattern", regex="\d+", score=0.5) number_recognizer = PatternRecognizer(supported_entity="NUMBER", patterns=[numbers_pattern]) result = number_recognizer.analyze(text=text2, entities=["NUMBER"]) print(f"\nRegex result:\n {result}")
def anonymize_reverse_lambda(analyzer_results, text_to_anonymize): anonymized_results = anonymizer.anonymize( text=text_to_anonymize, analyzer_results=analyzer_results, operators={"EMAIL_ADDRESS": OperatorConfig("custom", {"lambda": lambda x: x[::-1]})} ) return anonymized_results def anonymize_faker_lambda(analyzer_results, text_to_anonymize): anonymized_results = anonymizer.anonymize( text=text_to_anonymize, analyzer_results=analyzer_results, operators={"EMAIL_ADDRESS": OperatorConfig("custom", {"lambda": lambda x: fake.safe_email()})} ) return anonymized_results fake = Faker('en_US') fake.add_provider(internet) analyzer = AnalyzerEngine() anonymizer = AnonymizerEngine() text = 'The user has the following two emails: [email protected] and [email protected]' analyzer_results = analyzer.analyze(text=text, entities=["EMAIL_ADDRESS"], language='en') print("Origina Text: ", text) print("Analyzer result:", analyzer_results, '\n') print("Reverse lambda result: ",anonymize_reverse_lambda(analyzer_results, text).text, '\n') print("Faker lambda result: ",anonymize_faker_lambda(analyzer_results, text).text, '\n')
print("\nDocument properties for", doc) print("Author:", document.core_properties.author) print("Last Modified By:", document.core_properties.last_modified_by) print("Date:", document.core_properties.modified) print("\n===============================================") paras = document.paragraphs doctext = "" for i in paras: doctext += i.text print("\n===============================================") print("Extracted text from", doc) print(doctext) print("\n===============================================") engine = AnalyzerEngine() response = engine.analyze(correlation_id=0, text = doctext, entities=[], language='en', all_fields=True, score_threshold=0.5) for item in response: print("Start = {}, end = {}, entity = {}, confidence = {}".format(item.start, item.end, item.entity_type, item.score))
def __init__(self, language='en'): self.analyzer = AnalyzerEngine() self.anonymizer = AnonymizerEngine() self.language = language
def __init__(self): self.analyzer = AnalyzerEngine() self.anonymizer = AnonymizerEngine()
def test_when_read_test_spacy_nlp_conf_file_then_returns_spacy_nlp_engine( mock_registry, ): engine = AnalyzerEngine(registry=mock_registry) assert isinstance(engine.nlp_engine, SpacyNlpEngine) assert engine.nlp_engine.nlp is not None
def analyze(text): analyzer = AnalyzerEngine() analyzer_results = analyzer.analyze(text=text, language='en') return analyzer_results
Load the Presidio-supported TextAnalyticsEntityCategory from a yaml configuration file. """ categories_file = yaml.safe_load(open(file_location)) return [ TextAnalyticsEntityCategory(**category) for category in categories_file ] if __name__ == "__main__": import os from presidio_analyzer import AnalyzerEngine # Instruction for setting up Text Analytics and fetch instance key and endpoint: # https://github.com/MicrosoftDocs/azure-docs/blob/master/articles/cognitive-services/text-analytics/includes/create-text-analytics-resource.md text_analytics_recognizer = TextAnalyticsRecognizer( text_analytics_key="<YOUR_TEXT_ANALYTICS_KEY>", text_analytics_endpoint="<YOUR_TEXT_ANALYTICS_ENDPOINT>", categories_file_location=os.path.join( os.path.dirname(__file__), "example_text_analytics_entity_categories.yaml"), ) analyzer = AnalyzerEngine() analyzer.registry.add_recognizer(text_analytics_recognizer) results = analyzer.analyze( text="David is 30 years old. His IBAN: IL150120690000003111111", language="en") print(results)