def create_mock_pattern_recognizer(lang, entity, name):
    """Build a PatternRecognizer carrying a single fixed dummy pattern.

    :param lang: supported language code for the recognizer
    :param entity: entity type the recognizer reports
    :param name: recognizer name
    :return: a configured PatternRecognizer instance
    """
    dummy_patterns = [Pattern("pat", regex="REGEX", score=1.0)]
    return PatternRecognizer(
        supported_entity=entity,
        supported_language=lang,
        name=name,
        patterns=dummy_patterns,
    )
def test_remove_pattern_recognizer(self):
    """Verify a custom recognizer can be removed from the store.

    Adds a custom "spaceship" recognizer, confirms the registry lists it,
    removes it by name, and confirms the registry no longer returns it.
    """
    pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
    pattern_recognizer = PatternRecognizer("SPACESHIP",
                                           name="Spaceship recognizer",
                                           patterns=[pattern])

    # Make sure the analyzer doesn't get this entity
    recognizers_store_api_mock = RecognizerStoreApiMock()
    recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)

    # Expects zero custom recognizers
    recognizers = recognizer_registry.get_custom_recognizers()
    assert len(recognizers) == 0

    # Add a new recognizer for the word "spaceship" (case insensitive)
    recognizers_store_api_mock.add_custom_pattern_recognizer(
        pattern_recognizer)

    # Expects one custom recognizer
    recognizers = recognizer_registry.get_custom_recognizers()
    assert len(recognizers) == 1

    # Remove recognizer
    recognizers_store_api_mock.remove_recognizer("Spaceship recognizer")

    # Expects zero custom recognizers
    recognizers = recognizer_registry.get_custom_recognizers()
    assert len(recognizers) == 0
def zip_code_deny_list_recognizer():
    """Return a ZIP recognizer combining a weak regex pattern with a deny list."""
    zipcode_pattern = Pattern(
        name="zip code (weak)",
        regex=r"(\b\d{5}(?:\-\d{4})?\b)",
        score=0.01,
    )
    return PatternRecognizer(
        supported_entity="ZIP",
        deny_list=["999"],
        patterns=[zipcode_pattern],
    )
def test_added_pattern_recognizer_works(self):
    """A custom recognizer added to the store is picked up by the analyzer."""
    pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
    pattern_recognizer = PatternRecognizer("ROCKET",
                                           name="Rocket recognizer",
                                           patterns=[pattern])

    # Start from a store that does not know this entity yet
    recognizers_store_api_mock = RecognizerStoreApiMock()
    analyze_engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(recognizers_store_api_mock),
        nlp_engine=MockNlpEngine())
    text = "rocket is my favorite transportation"
    entities = ["CREDIT_CARD", "ROCKET"]

    # Without the custom recognizer nothing should be found
    results = analyze_engine.analyze(self.unit_test_guid,
                                     text=text,
                                     entities=entities,
                                     language='en',
                                     all_fields=False)
    assert len(results) == 0

    # Register the "rocket" recognizer (case insensitive)
    recognizers_store_api_mock.add_custom_pattern_recognizer(
        pattern_recognizer)

    # Now the entity should be recognized
    results = analyze_engine.analyze(self.unit_test_guid,
                                     text=text,
                                     entities=entities,
                                     language='en',
                                     all_fields=False)
    assert len(results) == 1
    assert_result(results[0], "ROCKET", 0, 7, 0.8)
def get_all_recognizers(self):
    """
    Return a list of PatternRecognizer objects built from the
    recognizers stored in the underlying remote store.

    On any gRPC failure the error is logged and an empty list is
    returned instead of raising, so callers always get a list.
    """
    req = recognizers_store_pb2.RecognizersGetAllRequest()
    raw_recognizers = []

    try:
        raw_recognizers = self.rs_stub.ApplyGetAll(req).recognizers
    except grpc.RpcError:
        # Best-effort: a store outage degrades to "no custom recognizers".
        logger.info("Failed getting recognizers from the remote store. "
                    "Returning an empty list")
        return raw_recognizers

    custom_recognizers = []
    for new_recognizer in raw_recognizers:
        # Convert each protobuf pattern into an analyzer Pattern.
        patterns = [
            Pattern(pat.name, pat.regex, pat.score)
            for pat in new_recognizer.patterns
        ]
        new_custom_recognizer = PatternRecognizer(
            name=new_recognizer.name,
            supported_entity=new_recognizer.entity,
            supported_language=new_recognizer.language,
            black_list=new_recognizer.blacklist,
            context=new_recognizer.contextPhrases,
            patterns=patterns)
        custom_recognizers.append(new_custom_recognizer)

    return custom_recognizers
def test_when_context_custom_recognizer_then_succeed(nlp_engine, mock_nlp_artifacts):
    """This test checks that a custom recognizer is also enhanced by context.

    However this test also verifies a specific case in which the pattern also
    includes a preceding space (' rocket'). This in turn causes a misalignment
    between the tokens and the regex match (the token will be just 'rocket').
    This misalignment is handled in order to find the correct context window.
    """
    rocket_recognizer = PatternRecognizer(
        supported_entity="ROCKET",
        name="rocketrecognizer",
        context=["cool"],
        # r"\s+" (single backslash in a raw string) is the whitespace class.
        # The previous r"\\s+" matched a literal backslash followed by 's',
        # which never occurs in the text, so both result lists were empty
        # and the score comparison below was vacuous.
        patterns=[Pattern("rocketpattern", r"\s+(rocket)", 0.3)],
    )
    text = "hi, this is a cool ROCKET"
    recognizer = rocket_recognizer
    entities = ["ROCKET"]
    nlp_artifacts = nlp_engine.process_text(text, "en")

    results_without_context = recognizer.analyze(text, entities, mock_nlp_artifacts)
    results_with_context = recognizer.analyze(text, entities, nlp_artifacts)

    assert len(results_without_context) == len(results_with_context)
    # Context ("cool") must strictly raise the score of every match.
    for res_wo, res_w in zip(results_without_context, results_with_context):
        assert res_wo.score < res_w.score
def test_when_ad_hoc_deny_list_recognizer_is_added_then_result_contains_result(
    loaded_analyzer_engine,
):
    """An ad-hoc deny-list recognizer passed at analyze time yields its entity."""
    text = "Mr. John Smith's drivers license is AC432223"
    mr_recognizer = PatternRecognizer(supported_entity="MR",
                                      deny_list=["Mr.", "Mr"])

    responses = loaded_analyzer_engine.analyze(
        text=text, language="en", ad_hoc_recognizers=[mr_recognizer])

    found_types = [response.entity_type for response in responses]
    assert "MR" in found_types
def __analyze_patterns(self, text):
    """
    Evaluates all patterns in the provided text, including words in
    the provided blacklist

    In a sentence we could get a false positive at the end of our regex, where we
    want to find the IBAN but not the false positive at the end of the match.
    i.e. "I want my deposit in DE89370400440532013000 2 days from today."

    :param text: text to analyze
    :param flags: regex flags
    :return: A list of RecognizerResult
    """
    results = []
    for pattern in self.patterns:
        matches = re.finditer(pattern.regex, text, flags=self.flags)

        for match in matches:
            # Walk capture groups from last to first; the span end is taken
            # from the group when it participated in the match (end > 0),
            # otherwise from the whole match. The `break` at the bottom means
            # only one result is emitted per regex match.
            for grp_num in reversed(range(1, len(match.groups()) + 1)):
                start = match.span(0)[0]
                end = (
                    match.span(grp_num)[1]
                    if match.span(grp_num)[1] > 0
                    else match.span(0)[1]
                )
                current_match = text[start:end]

                # Skip empty results
                if current_match == "":
                    continue

                score = pattern.score

                # Optional checksum/validation hook; None means "no opinion".
                validation_result = self.validate_result(current_match)
                description = PatternRecognizer.build_regex_explanation(
                    self.name, pattern.name, pattern.regex, score, validation_result
                )
                pattern_result = RecognizerResult(
                    self.supported_entities[0], start, end, score, description
                )

                # A definitive validation overrides the pattern score entirely.
                if validation_result is not None:
                    if validation_result:
                        pattern_result.score = EntityRecognizer.MAX_SCORE
                    else:
                        pattern_result.score = EntityRecognizer.MIN_SCORE

                # Invalidated (MIN_SCORE) results are dropped.
                if pattern_result.score > EntityRecognizer.MIN_SCORE:
                    results.append(pattern_result)

                break

    return results
def __init__(self, req_data: Dict):
    """Populate analyze-request fields from a raw request dictionary.

    :param req_data: request payload; missing keys default to None
        (or an empty list for ad_hoc_recognizers).
    """
    self.text = req_data.get("text")
    self.language = req_data.get("language")
    self.entities = req_data.get("entities")
    self.correlation_id = req_data.get("correlation_id")
    self.score_threshold = req_data.get("score_threshold")
    self.return_decision_process = req_data.get("return_decision_process")

    # Ad-hoc recognizers arrive serialized as dicts; deserialize each one.
    raw_ad_hoc = req_data.get("ad_hoc_recognizers")
    if raw_ad_hoc:
        self.ad_hoc_recognizers = [
            PatternRecognizer.from_dict(rec) for rec in raw_ad_hoc
        ]
    else:
        self.ad_hoc_recognizers = []
def test_removed_pattern_recognizer_doesnt_work(unit_test_guid):
    """After a recognizer is removed from the store it stops producing results."""
    pattern = Pattern("spaceship pattern", r"\W*(spaceship)\W*", 0.8)
    pattern_recognizer = PatternRecognizer("SPACESHIP",
                                           name="Spaceship recognizer",
                                           patterns=[pattern])

    # Make sure the analyzer doesn't get this entity
    recognizers_store_api_mock = RecognizerStoreApiMock()
    analyze_engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(recognizers_store_api_mock),
        nlp_engine=NlpEngineMock(),
    )
    text = "spaceship is my favorite transportation"
    entities = ["CREDIT_CARD", "SPACESHIP"]
    results = analyze_engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=entities,
        language="en",
        all_fields=False,
    )
    assert len(results) == 0

    # Add a new recognizer for the word "spaceship" (case insensitive)
    recognizers_store_api_mock.add_custom_pattern_recognizer(
        pattern_recognizer)

    # Check that the entity is recognized:
    results = analyze_engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=entities,
        language="en",
        all_fields=False,
    )
    assert len(results) == 1
    assert_result(results[0], "SPACESHIP", 0, 10, 0.8)

    # Remove recognizer
    recognizers_store_api_mock.remove_recognizer("Spaceship recognizer")

    # Test again to see we didn't get any results
    results = analyze_engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=entities,
        language="en",
        all_fields=False,
    )
    assert len(results) == 0
def test_pattern_recognizer(pii_csv, ext_csv, utterances, dictionary_path,
                            entity_name, pattern, score, num_of_examples,
                            acceptance_threshold, max_mistakes_number):
    """
    Test generic pattern recognizer with a dataset generated from template,
    a CSV values file with common entities and another CSV values file
    with a custom entity

    :param pii_csv: input csv file location with the common entities
    :param ext_csv: input csv file location with custom entities
    :param utterances: template file location
    :param dictionary_path: vocabulary/dictionary file location
    :param entity_name: custom entity name
    :param pattern: recognizer pattern
    :param score: score assigned to matches of the recognizer pattern
    :param num_of_examples: number of samples to be used from dataset to test
    :param acceptance_threshold: minimum precision/recall allowed for tests
        to pass
    :param max_mistakes_number: maximum number of model errors allowed
    """
    import os
    dir_path = os.path.dirname(os.path.realpath(__file__))
    dfpii = pd.read_csv(pii_csv.format(dir_path), encoding='utf-8')
    dfext = pd.read_csv(ext_csv.format(dir_path), encoding='utf-8')
    dictionary_path = dictionary_path.format(dir_path)
    ext_column_name = dfext.columns[0]

    def get_from_ext(i):
        # Cycle through the custom-entity values when there are fewer of
        # them than rows in the PII dataframe.
        index = i % dfext.shape[0]
        return dfext.iat[index, 0]

    # extend pii with ext data
    dfpii[ext_column_name] = [get_from_ext(i) for i in range(0, dfpii.shape[0])]

    # generate examples
    generator = FakeDataGenerator(fake_pii_csv_file=dfpii,
                                  utterances_file=utterances.format(dir_path),
                                  dictionary_path=dictionary_path)
    examples = generator.sample_examples(num_of_examples)

    pattern = Pattern("test pattern", pattern, score)
    pattern_recognizer = PatternRecognizer(entity_name,
                                           name="test recognizer",
                                           patterns=[pattern])

    scores = score_presidio_recognizer(
        pattern_recognizer, [entity_name], examples)
    # An all-NaN f-score means no relevant samples were generated; skip checks.
    if not np.isnan(scores.pii_f):
        assert acceptance_threshold <= scores.pii_f
        assert max_mistakes_number >= len(scores.model_errors)
def test_from_dict(self):
    """PatternRecognizer.from_dict restores every serialized field."""
    json = {'supported_entity': 'ENTITY_1',
            'supported_language': 'en',
            'patterns': [{'name': 'p1', 'score': 0.5, 'regex': '([0-9]{1,9})'}],
            'context': ['w1', 'w2', 'w3'],
            'version': "1.0"}

    new_recognizer = PatternRecognizer.from_dict(json)

    assert new_recognizer.supported_entities == ['ENTITY_1']
    assert new_recognizer.supported_language == 'en'
    first_pattern = new_recognizer.patterns[0]
    assert first_pattern.name == 'p1'
    assert first_pattern.score == 0.5
    assert first_pattern.regex == '([0-9]{1,9})'
    assert new_recognizer.context == ['w1', 'w2', 'w3']
    assert new_recognizer.version == "1.0"
def test_entities_filter_for_ad_hoc_removes_recognizer(loaded_analyzer_engine):
    """An entities filter excluding the ad-hoc entity suppresses its results."""
    text = "Mr. John Smith's zip code is 10002"
    mr_recognizer = PatternRecognizer(supported_entity="MR",
                                      deny_list=["Mr.", "Mr"])

    # No entity filter: the ad-hoc "MR" entity should be reported.
    responses1 = loaded_analyzer_engine.analyze(
        text=text, language="en", ad_hoc_recognizers=[mr_recognizer])
    # Filtered to PERSON only: "MR" must be absent.
    responses2 = loaded_analyzer_engine.analyze(
        text=text,
        language="en",
        ad_hoc_recognizers=[mr_recognizer],
        entities=["PERSON"],
    )

    unfiltered_types = [resp.entity_type for resp in responses1]
    filtered_types = [resp.entity_type for resp in responses2]
    assert "MR" in unfiltered_types
    assert "MR" not in filtered_types
def test_when_get_supported_fields_specific_language_then_return_single_result(
        loaded_registry, unit_test_guid, nlp_engine):
    """get_supported_entities("ru") returns only the Russian recognizer's entity."""
    ru_pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    ru_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer RU",
        patterns=[ru_pattern],
        supported_language="ru",
    )

    analyzer = AnalyzerEngine(registry=loaded_registry, nlp_engine=nlp_engine)
    analyzer.registry.add_recognizer(ru_recognizer)

    entities = analyzer.get_supported_entities(language="ru")

    # The only Russian-language recognizer is the one just added.
    assert len(entities) == 1
    assert "ROCKET" in entities
def test_when_add_pattern_recognizer_then_item_added():
    """Adding a recognizer to an empty registry makes it retrievable."""
    rocket_pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    rocket_recognizer = PatternRecognizer(
        "ROCKET", name="Rocket recognizer", patterns=[rocket_pattern]
    )

    # Start from a registry with no recognizers at all.
    registry = RecognizerRegistry(recognizers=[])
    assert len(registry.recognizers) == 0

    # Register the "rocket" recognizer (case insensitive).
    registry.add_recognizer(rocket_recognizer)

    assert len(registry.recognizers) == 1
    added = registry.recognizers[0]
    assert added.patterns[0].name == "rocket pattern"
    assert added.name == "Rocket recognizer"
def test_when_get_recognizers_then_returns_supported_language():
    """get_recognizers filters recognizers by the requested language."""
    ru_pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    ru_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer RU",
        patterns=[ru_pattern],
        supported_language="ru",
    )
    registry = RecognizerRegistryMock()
    registry.add_recognizer(ru_recognizer)
    engine = AnalyzerEngine(
        registry=registry,
        nlp_engine=NlpEngineMock(),
    )

    response = engine.get_recognizers(language="ru")

    # there is only 1 mocked russian recognizer
    assert len(response) == 1
def test_get_recognizers_returns_supported_language(self):
    """GetAllRecognizers with language="ru" returns only Russian recognizers."""
    ru_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
    ru_recognizer = PatternRecognizer("ROCKET",
                                      name="Rocket recognizer RU",
                                      patterns=[ru_pattern],
                                      supported_language="ru")
    store_mock = RecognizerStoreApiMock()
    store_mock.add_custom_pattern_recognizer(ru_recognizer)
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_mock),
        nlp_engine=MockNlpEngine())

    request = RecognizersAllRequest(language="ru")
    response = engine.GetAllRecognizers(request, None)

    # there is only 1 mocked russian recognizer
    assert len(response) == 1
def test_add_pattern_recognizer(self):
    """A recognizer added to the store shows up among custom recognizers."""
    rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
    rocket_recognizer = PatternRecognizer("ROCKET",
                                          name="Rocket recognizer",
                                          patterns=[rocket_pattern])

    # Fresh mock store: the registry should see no custom recognizers yet.
    store_mock = RecognizerStoreApiMock()
    registry = RecognizerRegistry(store_mock)
    assert len(registry.get_custom_recognizers()) == 0

    # Register the "rocket" recognizer (case insensitive).
    store_mock.add_custom_pattern_recognizer(rocket_recognizer)

    custom = registry.get_custom_recognizers()
    assert len(custom) == 1
    assert custom[0].patterns[0].name == "rocket pattern"
    assert custom[0].name == "Rocket recognizer"
def test_when_remove_pattern_recognizer_then_item_removed():
    """A recognizer added to the registry can be removed again by name."""
    pattern = Pattern("spaceship pattern", r"\W*(spaceship)\W*", 0.8)
    pattern_recognizer = PatternRecognizer(
        "SPACESHIP", name="Spaceship recognizer", patterns=[pattern]
    )

    # Create an empty recognizer registry
    recognizer_registry = RecognizerRegistry(recognizers=[])
    assert len(recognizer_registry.recognizers) == 0

    # Add a new recognizer for the word "spaceship" (case insensitive)
    recognizer_registry.add_recognizer(pattern_recognizer)

    # Expects one custom recognizer
    assert len(recognizer_registry.recognizers) == 1

    # Remove recognizer
    recognizer_registry.remove_recognizer("Spaceship recognizer")

    # Expects zero custom recognizers
    assert len(recognizer_registry.recognizers) == 0
def test_when_add_recognizer_then_also_outputs_others(nlp_engine):
    """A custom recognizer coexists with the predefined ones in analysis output."""
    rocket_pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[rocket_pattern],
        supported_language="en",
    )
    registry = RecognizerRegistry()
    registry.add_recognizer(rocket_recognizer)
    registry.load_predefined_recognizers()

    # The registry now holds the custom recognizer plus the predefined set.
    assert len(registry.recognizers) > 1

    analyzer = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine)

    text = "Michael Jones has a rocket"
    results = analyzer.analyze(text=text, language="en")

    # Two findings expected: one from the custom recognizer, one predefined.
    assert len(results) == 2
def test_get_recognizers_returns_added_custom(self):
    """GetAllRecognizers includes a newly added custom recognizer."""
    rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
    rocket_recognizer = PatternRecognizer("ROCKET",
                                          name="Rocket recognizer",
                                          patterns=[rocket_pattern])
    store_mock = RecognizerStoreApiMock()
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_mock),
        nlp_engine=MockNlpEngine())
    request = RecognizersAllRequest(language="en")

    response = engine.GetAllRecognizers(request, None)
    # there are 15 predefined recognizers
    assert len(response) == 15

    store_mock.add_custom_pattern_recognizer(rocket_recognizer)

    response = engine.GetAllRecognizers(request, None)
    # there are 15 predefined recognizers and one custom
    assert len(response) == 16
def test_from_dict_returns_instance(self):
    """from_dict builds a recognizer carrying every serialized pattern."""
    pattern_dicts = [
        {'name': 'p1', 'score': 0.5, 'regex': '([0-9]{1,9})'},
        {'name': 'p2', 'score': 0.8, 'regex': '([0-9]{1,9})'},
    ]
    ent_rec_dict = {"supported_entity": "A",
                    "supported_language": "he",
                    "patterns": pattern_dicts
                    }

    recognizer = PatternRecognizer.from_dict(ent_rec_dict)

    assert recognizer.supported_entities == ["A"]
    assert recognizer.supported_language == "he"
    # Default version is assigned when none is provided.
    assert recognizer.version == "0.0.1"

    for parsed, expected in zip(recognizer.patterns, pattern_dicts):
        assert parsed.name == expected['name']
        assert parsed.score == expected['score']
        assert parsed.regex == expected['regex']
def add_custom_pattern_recognizer(self, new_recognizer, skip_hash_update=False):
    """Store a copy of ``new_recognizer`` in the mock store.

    :param new_recognizer: PatternRecognizer whose fields are copied into
        a fresh instance held by the store
    :param skip_hash_update: when True, leave the store hash stale so
        cache tests can simulate an out-of-date store
    """
    patterns = [
        Pattern(pat.name, pat.regex, pat.score)
        for pat in new_recognizer.patterns
    ]
    new_custom_recognizer = PatternRecognizer(
        name=new_recognizer.name,
        supported_entity=new_recognizer.supported_entities[0],
        supported_language=new_recognizer.supported_language,
        black_list=new_recognizer.black_list,
        context=new_recognizer.context,
        patterns=patterns)
    self.recognizers.append(new_custom_recognizer)

    if skip_hash_update:
        return

    # Recompute the store hash over all recognizer names so registry
    # caches detect the change.
    m = hashlib.md5()
    for recognizer in self.recognizers:
        m.update(recognizer.name.encode('utf-8'))
    self.latest_hash = m.digest()
def test_from_dict_returns_instance():
    """from_dict builds a recognizer carrying every serialized pattern."""
    pattern_dicts = [
        {"name": "p1", "score": 0.5, "regex": "([0-9]{1,9})"},
        {"name": "p2", "score": 0.8, "regex": "([0-9]{1,9})"},
    ]
    ent_rec_dict = {
        "supported_entity": "A",
        "supported_language": "he",
        "patterns": pattern_dicts,
    }

    recognizer = PatternRecognizer.from_dict(ent_rec_dict)

    assert recognizer.supported_entities == ["A"]
    assert recognizer.supported_language == "he"
    # Default version is assigned when none is provided.
    assert recognizer.version == "0.0.1"

    for parsed, expected in zip(recognizer.patterns, pattern_dicts):
        assert parsed.name == expected["name"]
        assert parsed.score == expected["score"]
        assert parsed.regex == expected["regex"]
def test_from_dict():
    """Round-trip a full recognizer definition through from_dict."""
    # Renamed from `json` to avoid shadowing the stdlib module name.
    payload = {
        "supported_entity": "ENTITY_1",
        "supported_language": "en",
        "patterns": [{"name": "p1", "score": 0.5, "regex": "([0-9]{1,9})"}],
        "context": ["w1", "w2", "w3"],
        "version": "1.0",
    }

    recognizer = PatternRecognizer.from_dict(payload)

    assert recognizer.supported_entities == ["ENTITY_1"]
    assert recognizer.supported_language == "en"
    first_pattern = recognizer.patterns[0]
    assert first_pattern.name == "p1"
    assert first_pattern.score == 0.5
    assert first_pattern.regex == "([0-9]{1,9})"
    assert recognizer.context == ["w1", "w2", "w3"]
    assert recognizer.version == "1.0"
def test_cache_logic(self):
    """The registry re-reads the store only when the store hash changes."""
    rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
    rocket_recognizer = PatternRecognizer("ROCKET",
                                          name="Rocket recognizer",
                                          patterns=[rocket_pattern])

    # Negative flow: a stale hash means cached results are served and the
    # underlying storage is never read.
    store_mock = RecognizerStoreApiMock()
    registry = RecognizerRegistry(store_mock)
    assert len(registry.get_custom_recognizers()) == 0
    # No hash was returned, so no storage access is expected.
    assert store_mock.times_accessed_storage == 0

    # Add a recognizer WITHOUT refreshing the hash.
    store_mock.add_custom_pattern_recognizer(rocket_recognizer,
                                             skip_hash_update=True)

    # The stale hash keeps the cached (empty) result in place...
    assert len(registry.get_custom_recognizers()) == 0
    # ...and storage was still never touched.
    assert store_mock.times_accessed_storage == 0

    # Positive flow: identical steps, but this time the hash is updated.
    store_mock = RecognizerStoreApiMock()
    registry = RecognizerRegistry(store_mock)
    registry.get_custom_recognizers()
    assert store_mock.times_accessed_storage == 0

    store_mock.add_custom_pattern_recognizer(rocket_recognizer,
                                             skip_hash_update=False)

    assert len(registry.get_custom_recognizers()) == 1
    # The changed hash forced one read from the underlying storage.
    assert store_mock.times_accessed_storage == 1
def test_when_analyze_added_pattern_recognizer_then_succeed(unit_test_guid):
    """A recognizer added to the registry starts producing analyzer results."""
    rocket_pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    rocket_recognizer = PatternRecognizer("ROCKET",
                                          name="Rocket recognizer",
                                          patterns=[rocket_pattern])
    registry = RecognizerRegistryMock()

    # The engine starts without the custom entity.
    engine = AnalyzerEngine(
        registry=registry,
        nlp_engine=NlpEngineMock(),
    )
    text = "rocket is my favorite transportation"
    entities = ["CREDIT_CARD", "ROCKET"]
    results = engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=entities,
        language="en",
    )
    assert len(results) == 0

    # Register the "rocket" recognizer (case insensitive).
    registry.add_recognizer(rocket_recognizer)

    # The entity is now recognized.
    results = engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=entities,
        language="en",
    )
    assert len(results) == 1
    assert_result(results[0], "ROCKET", 0, 7, 0.8)
def test_get_recognizers_returns_custom():
    """GetAllRecognizers reports the custom recognizer with its metadata."""
    rocket_pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    rocket_recognizer = PatternRecognizer("ROCKET",
                                          name="Rocket recognizer",
                                          patterns=[rocket_pattern])
    store_mock = RecognizerStoreApiMock()
    store_mock.add_custom_pattern_recognizer(rocket_recognizer)
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_mock),
        nlp_engine=NlpEngineMock(),
    )
    request = RecognizersAllRequest(language="en")

    response = engine.GetAllRecognizers(request, None)

    # there are 15 predefined recognizers and one custom
    assert len(response) == 16
    matches = [
        recognizer
        for recognizer in response
        if recognizer.name == "Rocket recognizer"
        and recognizer.entities == ["ROCKET"]
        and recognizer.language == "en"
    ]
    assert len(matches) == 1
def test_context_custom_recognizer(self):
    """Context words ("cool") raise the score of a custom recognizer match.

    The pattern intentionally captures a preceding space (' rocket'), which
    misaligns the regex match with the tokenizer's 'rocket' token; the
    context-window logic must handle that misalignment to find the
    correct context window.
    """
    nlp_engine = TESTS_NLP_ENGINE
    mock_nlp_artifacts = NlpArtifacts([], [], [], [], None, "en")

    rocket_recognizer = PatternRecognizer(
        supported_entity="ROCKET",
        name="rocketrecognizer",
        context=["cool"],
        patterns=[Pattern("rocketpattern", "\\s+(rocket)", 0.3)])
    text = "hi, this is a cool ROCKET"
    recognizer = rocket_recognizer
    entities = ["ROCKET"]
    nlp_artifacts = nlp_engine.process_text(text, "en")

    no_context_results = recognizer.analyze(text, entities, mock_nlp_artifacts)
    context_results = recognizer.analyze(text, entities, nlp_artifacts)

    assert len(no_context_results) == len(context_results)
    # Every match scored with real NLP artifacts must beat the context-free score.
    for without_ctx, with_ctx in zip(no_context_results, context_results):
        assert without_ctx.score < with_ctx.score
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern

analyzer = AnalyzerEngine()

# --- Deny-list based ad-hoc entity: honorific titles -----------------------
text1 = "Professor Plum, in the Dining Room, with the candlestick"
titles_list = [
    "Sir", "Ma'am", "Madam", "Mr.", "Mrs.", "Ms.", "Miss", "Dr.", "Professor"
]
titles_recognizer = PatternRecognizer(supported_entity="TITLE",
                                      deny_list=titles_list)
analyzer.registry.add_recognizer(titles_recognizer)
result = analyzer.analyze(text=text1, language='en')
print(f"\nDeny List result:\n {result}")

# --- Regex based ad-hoc entity: plain digit runs ---------------------------
text2 = "I live in 510 Broad st."
# Use a raw string for the regex: "\d" inside a plain literal is an invalid
# escape sequence (SyntaxWarning/DeprecationWarning on modern Python).
numbers_pattern = Pattern(name="numbers_pattern", regex=r"\d+", score=0.5)
number_recognizer = PatternRecognizer(supported_entity="NUMBER",
                                      patterns=[numbers_pattern])
result = number_recognizer.analyze(text=text2, entities=["NUMBER"])
print(f"\nRegex result:\n {result}")