def test_remove_pattern_recognizer(self):
    """Check the custom recognizer count before adding, after adding and after removing."""
    spaceship_pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
    spaceship_recognizer = PatternRecognizer(
        "SPACESHIP",
        name="Spaceship recognizer",
        patterns=[spaceship_pattern])

    store_mock = RecognizerStoreApiMock()
    registry = RecognizerRegistry(store_mock)

    # Nothing has been stored yet, so no custom recognizers are expected.
    assert len(registry.get_custom_recognizers()) == 0

    # Store the spaceship recognizer; exactly one custom recognizer is expected.
    store_mock.add_custom_pattern_recognizer(spaceship_recognizer)
    assert len(registry.get_custom_recognizers()) == 1

    # Remove it by name; the count is expected to drop back to zero.
    store_mock.remove_recognizer("Spaceship recognizer")
    assert len(registry.get_custom_recognizers()) == 0
def test_context_custom_recognizer(self):
    """Check that a custom recognizer scores higher when context is available.

    The pattern deliberately starts with leading whitespace, so the regex
    match is not aligned with the token (the token is just 'rocket').
    That misalignment has to be handled for the context window to be found.
    """
    nlp_engine = SpacyNlpEngine()
    # Empty artifacts: the analysis run that uses these has no NLP data.
    empty_artifacts = NlpArtifacts([], [], [], [], None, "en")

    rocket_recognizer = PatternRecognizer(
        supported_entity="ROCKET",
        name="rocketrecognizer",
        context=["cool"],
        patterns=[Pattern("rocketpattern", "\\s+(rocket)", 0.3)])

    text = "hi, this is a cool ROCKET"
    entities = ["ROCKET"]
    real_artifacts = nlp_engine.process_text(text, "en")

    plain_results = rocket_recognizer.analyze(text, entities, empty_artifacts)
    enhanced_results = rocket_recognizer.analyze(text, entities, real_artifacts)

    assert (len(plain_results) == len(enhanced_results))
    # Every result is expected to score strictly higher with real artifacts.
    for plain, enhanced in zip(plain_results, enhanced_results):
        assert (plain.score < enhanced.score)
def test_added_pattern_recognizer_works(self):
    """Check that the analyzer finds ROCKET only after its recognizer is stored."""
    rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[rocket_pattern])

    store_mock = RecognizerStoreApiMock()
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_mock),
        nlp_engine=MockNlpEngine())

    text = "rocket is my favorite transportation"
    entities = ["CREDIT_CARD", "ROCKET"]

    def run_analysis():
        # Same request every time; only the store contents change between calls.
        return engine.analyze(self.unit_test_guid,
                              text=text,
                              entities=entities,
                              language='en',
                              all_fields=False)

    # The recognizer has not been stored yet, so no results are expected.
    assert len(run_analysis()) == 0

    # Store the rocket recognizer.
    store_mock.add_custom_pattern_recognizer(rocket_recognizer)

    # Now a single ROCKET result is expected.
    results = run_analysis()
    assert len(results) == 1
    assert_result(results[0], "ROCKET", 0, 7, 0.8)
def get_all_recognizers(self):
    """
    Returns a list of PatternRecognizer objects built from the
    recognizers stored in the underlying store.

    The store is queried through self.rs_stub.ApplyGetAll. If that call
    raises grpc.RpcError, the failure is logged and an empty list is
    returned instead of propagating the error.
    """
    req = recognizers_store_pb2.RecognizersGetAllRequest()
    try:
        raw_recognizers = self.rs_stub.ApplyGetAll(req).recognizers
    except grpc.RpcError:
        # Adjacent literals are concatenated; a backslash continuation
        # inside the string would leak extra characters into the message.
        logging.info("Failed getting recognizers from the remote store. "
                     "Returning an empty list")
        return []

    custom_recognizers = []
    for new_recognizer in raw_recognizers:
        # Rebuild each stored pattern as a Pattern object.
        patterns = [Pattern(pat.name, pat.regex, pat.score)
                    for pat in new_recognizer.patterns]
        custom_recognizers.append(PatternRecognizer(
            name=new_recognizer.name,
            supported_entity=new_recognizer.entity,
            supported_language=new_recognizer.language,
            black_list=new_recognizer.blacklist,
            context=new_recognizer.contextPhrases,
            patterns=patterns))
    return custom_recognizers
def test_from_dict(self):
    """Check that PatternRecognizer.from_dict carries over every supplied field."""
    recognizer_dict = {
        'supported_entity': 'ENTITY_1',
        'supported_language': 'en',
        'patterns': [{'name': 'p1', 'score': 0.5, 'regex': '([0-9]{1,9})'}],
        'context': ['w1', 'w2', 'w3'],
        'version': "1.0",
    }

    built = PatternRecognizer.from_dict(recognizer_dict)

    # NOTE: these assertions are candidates for refactoring.
    assert built.supported_entities == ['ENTITY_1']
    assert built.supported_language == 'en'

    first_pattern = built.patterns[0]
    assert first_pattern.name == 'p1'
    assert first_pattern.score == 0.5
    assert first_pattern.regex == '([0-9]{1,9})'

    assert built.context == ['w1', 'w2', 'w3']
    assert built.version == "1.0"
def test_removed_pattern_recognizer_doesnt_work(self):
    """Check that SPACESHIP is found only while its recognizer is in the store."""
    spaceship_pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
    spaceship_recognizer = PatternRecognizer(
        "SPACESHIP",
        name="Spaceship recognizer",
        patterns=[spaceship_pattern])

    store_mock = RecognizerStoreApiMock()
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_mock),
        nlp_engine=MockNlpEngine())

    text = "spaceship is my favorite transportation"
    entities = ["CREDIT_CARD", "SPACESHIP"]

    def run_analysis():
        # Same request every time; only the store contents change between calls.
        return engine.analyze(self.unit_test_guid,
                              text=text,
                              entities=entities,
                              language='en',
                              all_fields=False)

    # The recognizer has not been stored yet, so no results are expected.
    assert len(run_analysis()) == 0

    # Store the spaceship recognizer.
    store_mock.add_custom_pattern_recognizer(spaceship_recognizer)

    # A single SPACESHIP result is expected now.
    results = run_analysis()
    assert len(results) == 1
    assert_result(results[0], "SPACESHIP", 0, 10, 0.8)

    # Remove the recognizer by name.
    store_mock.remove_recognizer("Spaceship recognizer")

    # The same request is expected to come back empty again.
    assert len(run_analysis()) == 0
def add_custom_pattern_recognizer(self, new_recognizer, skip_hash_update=False):
    """Append a copy of new_recognizer to self.recognizers.

    The copy is a PatternRecognizer rebuilt from the recognizer's name, its
    first supported entity, language, black list, context and patterns.
    Unless skip_hash_update is true, self.latest_hash is then recomputed as
    the MD5 digest over the UTF-8 encoded names of all stored recognizers.
    """
    copied_patterns = [Pattern(src.name, src.regex, src.score)
                       for src in new_recognizer.patterns]
    self.recognizers.append(PatternRecognizer(
        name=new_recognizer.name,
        supported_entity=new_recognizer.supported_entities[0],
        supported_language=new_recognizer.supported_language,
        black_list=new_recognizer.black_list,
        context=new_recognizer.context,
        patterns=copied_patterns))

    if not skip_hash_update:
        digest = hashlib.md5()
        for stored in self.recognizers:
            digest.update(stored.name.encode('utf-8'))
        self.latest_hash = digest.digest()
def test_add_pattern_recognizer(self):
    """Check that a stored recognizer is returned by the registry with its names intact."""
    rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[rocket_pattern])

    store_mock = RecognizerStoreApiMock()
    registry = RecognizerRegistry(store_mock)

    # Nothing has been stored yet, so no custom recognizers are expected.
    assert len(registry.get_custom_recognizers()) == 0

    # Store the rocket recognizer.
    store_mock.add_custom_pattern_recognizer(rocket_recognizer)

    # Exactly one recognizer is expected, carrying the original names.
    found = registry.get_custom_recognizers()
    assert len(found) == 1
    only_recognizer = found[0]
    assert only_recognizer.patterns[0].name == "rocket pattern"
    assert only_recognizer.name == "Rocket recognizer"
def test_cache_logic(self):
    """Check when the registry reads from storage, depending on the store hash.

    Negative flow: a recognizer added without a hash update is not expected
    to be returned, and storage is not expected to be accessed.
    Positive flow: with the hash updated, the recognizer is expected to be
    returned after a single storage access.
    """
    rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[rocket_pattern])

    # ---- Negative flow ----
    stale_store = RecognizerStoreApiMock()
    stale_registry = RecognizerRegistry(stale_store)

    # Nothing should be returned from an empty store.
    assert len(stale_registry.get_custom_recognizers()) == 0
    # Since no hash was returned, no access to storage is expected.
    assert stale_store.times_accessed_storage == 0

    # Add a recognizer, but leave the hash untouched.
    stale_store.add_custom_pattern_recognizer(
        rocket_recognizer, skip_hash_update=True)

    # The hash did not change, so the stale result without the new
    # recognizer is expected.
    assert len(stale_registry.get_custom_recognizers()) == 0
    # Storage is still not expected to have been accessed.
    assert stale_store.times_accessed_storage == 0

    # ---- Positive flow ----
    # Same steps, but this time the hash is updated on add.
    fresh_store = RecognizerStoreApiMock()
    fresh_registry = RecognizerRegistry(fresh_store)

    fresh_registry.get_custom_recognizers()
    assert fresh_store.times_accessed_storage == 0

    fresh_store.add_custom_pattern_recognizer(
        rocket_recognizer, skip_hash_update=False)

    assert len(fresh_registry.get_custom_recognizers()) == 1
    # Storage is expected to have been accessed exactly once.
    assert fresh_store.times_accessed_storage == 1
def test_from_dict_returns_instance(self):
    """Check PatternRecognizer.from_dict with two patterns and no explicit version."""
    shared_regex = '([0-9]{1,9})'
    # (name, score) of each pattern, in the order they are supplied.
    expected_patterns = [("p1", 0.5), ("p2", 0.8)]

    ent_rec_dict = {
        "supported_entity": "A",
        "supported_language": "he",
        "patterns": [
            {'name': name, 'score': score, 'regex': shared_regex}
            for name, score in expected_patterns
        ],
    }

    recognizer = PatternRecognizer.from_dict(ent_rec_dict)

    assert recognizer.supported_entities == ["A"]
    assert recognizer.supported_language == "he"
    # No 'version' key was supplied; "0.0.1" is the value expected here.
    assert recognizer.version == "0.0.1"

    for position, (name, score) in enumerate(expected_patterns):
        built = recognizer.patterns[position]
        assert built.name == name
        assert built.score == score
        assert built.regex == '([0-9]{1,9})'
def get_mock_pattern_recognizer(self, lang, entity, name):
    """Build a PatternRecognizer for the given language, entity and name.

    The recognizer carries a single placeholder pattern named "pat" with
    the regex "REGEX" and a score of 1.0.
    """
    placeholder_pattern = Pattern("pat", regex="REGEX", score=1.0)
    return PatternRecognizer(
        supported_entity=entity,
        supported_language=lang,
        name=name,
        patterns=[placeholder_pattern])