def __init__(self):
    """Configure the recognizer for the ``SG_NRIC_FIN`` entity.

    Two patterns are registered (weak and medium) and ``CONTEXT`` is
    forwarded to the parent initializer.
    """
    weak = Pattern(name="Nric (weak) ", regex=WEAK_REGEX, score=0.3)
    medium = Pattern(name="Nric (medium) ", regex=MEDIUM_REGEX, score=0.5)
    super().__init__(
        supported_entity="SG_NRIC_FIN",
        patterns=[weak, medium],
        context=CONTEXT,
    )
def __init__(self):
    """Configure the recognizer for the ``US_DRIVER_LICENSE`` entity.

    Registers three patterns of decreasing score (WA, alphanumeric,
    digits-only) and forwards ``LICENSE_CONTEXT`` to the parent.
    """
    # (name, regex, score) triples, listed in the order they are registered.
    pattern_specs = (
        ("Driver License - WA (weak) ", WA_WEAK_REGEX, 0.4),
        ("Driver License - Alphanumeric (weak) ", ALPHANUMERIC_REGEX, 0.3),
        ("Driver License - Digits (very weak)", DIGITS_REGEX, 0.01),
    )
    patterns = [
        Pattern(name, regex, score) for name, regex, score in pattern_specs
    ]
    super().__init__(
        supported_entity="US_DRIVER_LICENSE",
        patterns=patterns,
        context=LICENSE_CONTEXT,
    )
def __init__(self):
    """Configure the recognizer for the ``IP_ADDRESS`` entity.

    Registers one IPv4 and one IPv6 pattern, both with score 0.6, and
    forwards ``IP_CONTEXT`` to the parent initializer.
    """
    ipv4 = Pattern(name="IPv4", regex=IP_V4_REGEX, score=0.6)
    ipv6 = Pattern(name="IPv6", regex=IP_V6_REGEX, score=0.6)
    super().__init__(
        supported_entity="IP_ADDRESS",
        patterns=[ipv4, ipv6],
        context=IP_CONTEXT,
    )
def __init__(self):
    """Configure the recognizer for the ``US_SSN`` entity.

    Registers very-weak, weak and medium patterns and forwards
    ``CONTEXT`` to the parent initializer.
    """
    # (name, regex, score) triples, listed in the order they are registered.
    pattern_specs = (
        ("SSN (very weak)", VERY_WEAK_REGEX, 0.05),
        ("SSN (weak)", WEAK_REGEX, 0.3),
        ("SSN (medium)", MEDIUM_REGEX, 0.5),
    )
    patterns = [
        Pattern(name, regex, score) for name, regex, score in pattern_specs
    ]
    super().__init__(
        supported_entity="US_SSN",
        patterns=patterns,
        context=CONTEXT,
    )
def test_no_entity_for_pattern_recognizer(self):
    """Expect ``ValueError`` when a recognizer is built with ``entity=[]``."""
    with pytest.raises(ValueError):
        same_name_patterns = [
            Pattern("p1", "someregex", 1.0),
            Pattern("p1", "someregex", 0.5),
        ]
        MockRecognizer(
            entity=[],
            patterns=same_name_patterns,
            black_list=[],
            name=None,
            context=None,
        )
def __init__(self):
    """Configure the recognizer for the ``PHONE_NUMBER`` entity.

    The regexes and scores are read from the ``UsPhoneRecognizer`` class
    attributes; ``CONTEXT`` is forwarded to the parent initializer.
    """
    strong = Pattern(
        name="Phone (strong)",
        regex=UsPhoneRecognizer.STRONG_REGEX,
        score=UsPhoneRecognizer.STRONG_REGEX_SCORE,
    )
    medium = Pattern(
        name="Phone (medium)",
        regex=UsPhoneRecognizer.MEDIUM_REGEX,
        score=UsPhoneRecognizer.MEDIUM_REGEX_SCORE,
    )
    weak = Pattern(
        name="Phone (weak)",
        regex=UsPhoneRecognizer.WEAK_REGEX,
        score=UsPhoneRecognizer.WEAK_REGEX_SCORE,
    )
    super().__init__(
        supported_entity="PHONE_NUMBER",
        patterns=[strong, medium, weak],
        context=CONTEXT,
    )
def get_all_recognizers(self):
    """
    Returns a list of PatternRecognizer objects created from the
    recognizers stored in the underlying store.

    :return: one PatternRecognizer per stored recognizer, or an empty
             list when the store request fails with ``grpc.RpcError``
    """
    req = recognizers_store_pb2.RecognizersGetAllRequest()
    try:
        raw_recognizers = self.rs_stub.ApplyGetAll(req).recognizers
    except grpc.RpcError:
        # Adjacent string literals are used instead of a backslash
        # continuation inside the literal, which would embed the source
        # indentation in the logged message.
        logging.info("Failed getting recognizers from the remote store. "
                     "Returning an empty list")
        return []

    custom_recognizers = []
    for new_recognizer in raw_recognizers:
        # Rebuild each stored pattern as a local Pattern object.
        patterns = [Pattern(pat.name, pat.regex, pat.score)
                    for pat in new_recognizer.patterns]
        custom_recognizers.append(PatternRecognizer(
            name=new_recognizer.name,
            supported_entity=new_recognizer.entity,
            supported_language=new_recognizer.language,
            black_list=new_recognizer.blacklist,
            context=new_recognizer.contextPhrases,
            patterns=patterns))
    return custom_recognizers
def from_dict(cls, entity_recognizer_dict):
    """
    Build an instance of ``cls`` from a dictionary of constructor arguments.

    When the dictionary has a non-empty ``"patterns"`` entry, each item is
    converted with ``Pattern.from_dict`` before being passed on.

    :param entity_recognizer_dict: mapping of keyword arguments for ``cls``;
           it is not modified by this call
    :return: the result of ``cls(**kwargs)``
    """
    # Work on a shallow copy so the caller's dictionary keeps its original
    # (serialized) patterns instead of being overwritten in place.
    init_kwargs = dict(entity_recognizer_dict)
    patterns = init_kwargs.get("patterns")
    if patterns:
        init_kwargs['patterns'] = [Pattern.from_dict(pat) for pat in patterns]
    return cls(**init_kwargs)
def test_added_pattern_recognizer_works(self):
    """A recognizer added to the store mock is picked up by the analyzer."""
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)],
    )

    store_mock = RecognizerStoreApiMock()
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_mock),
        nlp_engine=MockNlpEngine(),
    )

    def run_analysis():
        # Same request both times; only the store contents change.
        return engine.analyze(
            self.unit_test_guid,
            text="rocket is my favorite transportation",
            entities=["CREDIT_CARD", "ROCKET"],
            language='en',
            all_fields=False,
        )

    # Before the recognizer is registered nothing is reported.
    before = run_analysis()
    assert len(before) == 0

    # Register the recognizer for the word "rocket".
    store_mock.add_custom_pattern_recognizer(rocket_recognizer)

    # Now the entity is reported.
    after = run_analysis()
    assert len(after) == 1
    assert_result(after[0], "ROCKET", 0, 7, 0.8)
def test_remove_pattern_recognizer(self):
    """Adding then removing a recognizer leaves the registry empty again."""
    spaceship_recognizer = PatternRecognizer(
        "SPACESHIP",
        name="Spaceship recognizer",
        patterns=[Pattern("spaceship pattern", r"\W*(spaceship)\W*", 0.8)],
    )

    store_mock = RecognizerStoreApiMock()
    registry = RecognizerRegistry(store_mock)

    # Nothing registered yet.
    assert len(registry.get_custom_recognizers()) == 0

    # Register the recognizer for the word "spaceship".
    store_mock.add_custom_pattern_recognizer(spaceship_recognizer)
    assert len(registry.get_custom_recognizers()) == 1

    # Remove it by name and expect an empty registry again.
    store_mock.remove_recognizer("Spaceship recognizer")
    assert len(registry.get_custom_recognizers()) == 0
def __init__(self):
    """Configure the recognizer for the ``IBAN_CODE`` entity.

    Registers a single generic pattern and forwards ``CONTEXT`` to the
    parent initializer.
    """
    generic = Pattern(
        name="IBAN Generic",
        regex=IBAN_GENERIC_REGEX,
        score=IBAN_GENERIC_SCORE,
    )
    super().__init__(
        supported_entity="IBAN_CODE",
        patterns=[generic],
        context=CONTEXT,
    )
def test_from_dict(self):
    """``Pattern.from_dict`` reproduces the fields of ``my_pattern``."""
    rebuilt = Pattern.from_dict(my_pattern_dict)
    for field in ("name", "score", "regex"):
        assert getattr(my_pattern, field) == getattr(rebuilt, field)
def test_context_custom_recognizer(self):
    """A custom recognizer scores higher when analyzed with NLP artifacts.

    The pattern deliberately includes preceding whitespace, so the regex
    match and the token ('rocket') are misaligned; per the original test
    notes, that misalignment is handled when locating the context window.
    """
    spacy_engine = SpacyNlpEngine()
    empty_artifacts = NlpArtifacts([], [], [], [], None, "en")

    recognizer = PatternRecognizer(
        supported_entity="ROCKET",
        name="rocketrecognizer",
        context=["cool"],
        patterns=[Pattern("rocketpattern", "\\s+(rocket)", 0.3)],
    )

    text = "hi, this is a cool ROCKET"
    entities = ["ROCKET"]
    real_artifacts = spacy_engine.process_text(text, "en")

    plain_results = recognizer.analyze(text, entities, empty_artifacts)
    enhanced_results = recognizer.analyze(text, entities, real_artifacts)

    assert (len(plain_results) == len(enhanced_results))
    # Lengths are equal (asserted above), so pairing covers every result.
    for plain, enhanced in zip(plain_results, enhanced_results):
        assert (plain.score < enhanced.score)
def __black_list_to_regex(black_list):
    """
    Converts a list of words to a matching regex, to be analyzed by the
    regex engine as a part of the analyze logic.

    Each word is escaped, so it is matched literally even when it contains
    regex metacharacters (e.g. "." or "+").

    :param black_list: the list of words to detect
    :return: a Pattern named "black_list" with score 1.0 whose regex
             matches any of the words when delimited by a space or by the
             start/end of the text
    """
    # Local import keeps the block self-contained regardless of the
    # module's import list.
    import re

    escaped_words = [re.escape(word) for word in black_list]
    regex = r"(?:^|(?<= ))(" + '|'.join(escaped_words) + r")(?:(?= )|$)"
    return Pattern(name="black_list", regex=regex, score=1.0)
def test_removed_pattern_recognizer_doesnt_work(self):
    """A recognizer removed from the store mock stops producing results."""
    spaceship_recognizer = PatternRecognizer(
        "SPACESHIP",
        name="Spaceship recognizer",
        patterns=[Pattern("spaceship pattern", r"\W*(spaceship)\W*", 0.8)],
    )

    store_mock = RecognizerStoreApiMock()
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_mock),
        nlp_engine=MockNlpEngine(),
    )

    def run_analysis():
        # Same request every time; only the store contents change.
        return engine.analyze(
            self.unit_test_guid,
            text="spaceship is my favorite transportation",
            entities=["CREDIT_CARD", "SPACESHIP"],
            language='en',
            all_fields=False,
        )

    # Nothing is reported before the recognizer is registered.
    initial = run_analysis()
    assert len(initial) == 0

    # Register the recognizer for the word "spaceship".
    store_mock.add_custom_pattern_recognizer(spaceship_recognizer)

    # The entity is now reported.
    with_recognizer = run_analysis()
    assert len(with_recognizer) == 1
    assert_result(with_recognizer[0], "SPACESHIP", 0, 10, 0.8)

    # Remove the recognizer by name.
    store_mock.remove_recognizer("Spaceship recognizer")

    # The entity is no longer reported.
    after_removal = run_analysis()
    assert len(after_removal) == 0
def add_custom_pattern_recognizer(self, new_recognizer,
                                  skip_hash_update=False):
    """Store a copy of ``new_recognizer`` and optionally refresh the hash.

    :param new_recognizer: recognizer whose name, first supported entity,
           language, black list, context and patterns are copied
    :param skip_hash_update: when true, ``self.latest_hash`` is left
           untouched after the recognizer is appended
    """
    copied_patterns = [
        Pattern(pat.name, pat.regex, pat.score)
        for pat in new_recognizer.patterns
    ]
    self.recognizers.append(PatternRecognizer(
        name=new_recognizer.name,
        supported_entity=new_recognizer.supported_entities[0],
        supported_language=new_recognizer.supported_language,
        black_list=new_recognizer.black_list,
        context=new_recognizer.context,
        patterns=copied_patterns))

    if not skip_hash_update:
        # The hash is an MD5 digest over all stored recognizer names.
        digest = hashlib.md5()
        for stored in self.recognizers:
            digest.update(stored.name.encode('utf-8'))
        self.latest_hash = digest.digest()
def test_add_pattern_recognizer(self):
    """A recognizer added to the store mock is returned by the registry."""
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)],
    )

    store_mock = RecognizerStoreApiMock()
    registry = RecognizerRegistry(store_mock)

    # Nothing registered yet.
    assert len(registry.get_custom_recognizers()) == 0

    # Register the recognizer for the word "rocket".
    store_mock.add_custom_pattern_recognizer(rocket_recognizer)

    fetched = registry.get_custom_recognizers()
    assert len(fetched) == 1
    assert fetched[0].patterns[0].name == "rocket pattern"
    assert fetched[0].name == "Rocket recognizer"
def test_cache_logic(self):
    """The registry only re-reads the store when the store hash changes."""
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)],
    )

    # ---- Negative flow: hash is never updated ----
    store_mock = RecognizerStoreApiMock()
    registry = RecognizerRegistry(store_mock)

    fetched = registry.get_custom_recognizers()
    # Nothing is stored, and with no hash returned the storage is untouched.
    assert len(fetched) == 0
    assert store_mock.times_accessed_storage == 0

    # Add a recognizer without updating the hash.
    store_mock.add_custom_pattern_recognizer(
        rocket_recognizer, skip_hash_update=True)

    # The cached (stale) view is returned, without the new recognizer ...
    fetched = registry.get_custom_recognizers()
    assert len(fetched) == 0
    # ... and the underlying storage was still not accessed.
    assert store_mock.times_accessed_storage == 0

    # ---- Positive flow: same steps, but the hash is updated ----
    store_mock = RecognizerStoreApiMock()
    registry = RecognizerRegistry(store_mock)

    registry.get_custom_recognizers()
    assert store_mock.times_accessed_storage == 0

    store_mock.add_custom_pattern_recognizer(
        rocket_recognizer, skip_hash_update=False)

    fetched = registry.get_custom_recognizers()
    assert len(fetched) == 1
    # The storage was read exactly once to pick up the new recognizer.
    assert store_mock.times_accessed_storage == 1
def __init__(self):
    """Configure the recognizer for the ``US_PASSPORT`` entity.

    Registers a single very-weak pattern (score 0.2) and forwards
    ``CONTEXT`` to the parent initializer.
    """
    very_weak = Pattern(
        name="Passport (very weak)", regex=VERY_WEAK_REGEX, score=0.2)
    super().__init__(
        supported_entity="US_PASSPORT",
        patterns=[very_weak],
        context=CONTEXT,
    )
def get_mock_pattern_recognizer(self, lang, entity, name):
    """Build a ``PatternRecognizer`` carrying one placeholder pattern.

    :param lang: value passed as ``supported_language``
    :param entity: value passed as ``supported_entity``
    :param name: value passed as ``name``
    :return: the constructed ``PatternRecognizer``
    """
    placeholder = Pattern("pat", regex="REGEX", score=1.0)
    return PatternRecognizer(
        supported_entity=entity,
        supported_language=lang,
        name=name,
        patterns=[placeholder],
    )
def __init__(self):
    """Configure the recognizer for the ``EMAIL_ADDRESS`` entity.

    Registers a single pattern with score 0.5 and forwards ``CONTEXT``
    to the parent initializer.
    """
    email = Pattern(name="Email (Medium)", regex=REGEX, score=0.5)
    super().__init__(
        supported_entity="EMAIL_ADDRESS",
        patterns=[email],
        context=CONTEXT,
    )
def __init__(self):
    """Configure the recognizer for the ``CRYPTO`` entity.

    Registers a single pattern with score 0.5 and forwards ``CONTEXT``
    to the parent initializer.
    """
    crypto = Pattern(name="Crypto (Medium)", regex=REGEX, score=0.5)
    super().__init__(
        supported_entity="CRYPTO",
        patterns=[crypto],
        context=CONTEXT,
    )
def __init__(self):
    """Configure the recognizer for the ``CREDIT_CARD`` entity.

    Registers a single weak pattern (score 0.2) and forwards ``CONTEXT``
    to the parent initializer.
    """
    all_cards = Pattern(
        name="All Credit Cards (weak)", regex=REGEX, score=0.2)
    super().__init__(
        supported_entity="CREDIT_CARD",
        patterns=[all_cards],
        context=CONTEXT,
    )
def __init__(self):
    """Configure the recognizer for the ``DOMAIN_NAME`` entity.

    Registers a single pattern with score 0.5 and forwards ``CONTEXT``
    to the parent initializer.
    """
    domain = Pattern(name="Domain ()", regex=REGEX, score=0.5)
    super().__init__(
        supported_entity="DOMAIN_NAME",
        patterns=[domain],
        context=CONTEXT,
    )
def __init__(self):
    """Configure the recognizer for the ``UK_NHS`` entity.

    Registers a single medium pattern (score 0.5) and forwards
    ``CONTEXT`` to the parent initializer.
    """
    nhs = Pattern(name="NHS (medium)", regex=REGEX, score=0.5)
    super().__init__(
        supported_entity="UK_NHS",
        patterns=[nhs],
        context=CONTEXT,
    )
from unittest import TestCase

from analyzer import Pattern

# Shared fixtures: a Pattern instance and its expected dictionary form.
my_pattern = Pattern(name="my pattern", score=0.9, regex="[re]")
my_pattern_dict = {"name": "my pattern", "regex": "[re]", "score": 0.9}


class TestPattern(TestCase):
    """Checks for converting ``Pattern`` to and from a dictionary."""

    def test_to_dict(self):
        """``to_dict`` yields the expected dictionary."""
        serialized = my_pattern.to_dict()
        assert my_pattern_dict == serialized

    def test_from_dict(self):
        """``from_dict`` yields a Pattern with matching fields."""
        rebuilt = Pattern.from_dict(my_pattern_dict)
        for field in ("name", "score", "regex"):
            assert getattr(my_pattern, field) == getattr(rebuilt, field)
def __init__(self):
    """Configure the recognizer for the ``US_BANK_NUMBER`` entity.

    Registers a single weak pattern (score 0.05) and forwards
    ``CONTEXT`` to the parent initializer.
    """
    bank_account = Pattern(
        name="Bank Account (weak)", regex=REGEX, score=0.05)
    super().__init__(
        supported_entity="US_BANK_NUMBER",
        patterns=[bank_account],
        context=CONTEXT,
    )