예제 #1
0
 def __init__(self):
     """Set up the recognizer for the ``SG_NRIC_FIN`` entity.

     Two regex patterns are registered, a weak one scored 0.3 and a
     medium one scored 0.5, and ``CONTEXT`` is handed to the parent
     initializer as the context.
     """
     weak = Pattern(name='Nric (weak) ', regex=WEAK_REGEX, score=0.3)
     medium = Pattern(name='Nric (medium) ', regex=MEDIUM_REGEX, score=0.5)
     super().__init__(supported_entity="SG_NRIC_FIN",
                      patterns=[weak, medium],
                      context=CONTEXT)
 def __init__(self):
     """Set up the recognizer for the ``US_DRIVER_LICENSE`` entity.

     Three regex patterns are registered, scored 0.4, 0.3 and 0.01, and
     ``LICENSE_CONTEXT`` is handed to the parent initializer as the
     context.
     """
     wa_weak = Pattern(name='Driver License - WA (weak) ',
                       regex=WA_WEAK_REGEX, score=0.4)
     alphanumeric_weak = Pattern(
         name='Driver License - Alphanumeric (weak) ',
         regex=ALPHANUMERIC_REGEX, score=0.3)
     digits_very_weak = Pattern(name='Driver License - Digits (very weak)',
                                regex=DIGITS_REGEX, score=0.01)
     super().__init__(
         supported_entity="US_DRIVER_LICENSE",
         patterns=[wa_weak, alphanumeric_weak, digits_very_weak],
         context=LICENSE_CONTEXT)
예제 #3
0
 def __init__(self):
     """Set up the recognizer for the ``IP_ADDRESS`` entity.

     Registers one IPv4 and one IPv6 regex pattern, both scored 0.6, and
     passes ``IP_CONTEXT`` to the parent initializer as the context.
     """
     ipv4 = Pattern(name='IPv4', regex=IP_V4_REGEX, score=0.6)
     ipv6 = Pattern(name='IPv6', regex=IP_V6_REGEX, score=0.6)
     super().__init__(supported_entity="IP_ADDRESS",
                      patterns=[ipv4, ipv6],
                      context=IP_CONTEXT)
예제 #4
0
 def __init__(self):
     """Set up the recognizer for the ``US_SSN`` entity.

     Registers three regex patterns with increasing scores (0.05, 0.3
     and 0.5) and passes ``CONTEXT`` to the parent initializer.
     """
     very_weak = Pattern(name='SSN (very weak)', regex=VERY_WEAK_REGEX,
                         score=0.05)
     weak = Pattern(name='SSN (weak)', regex=WEAK_REGEX, score=0.3)
     medium = Pattern(name='SSN (medium)', regex=MEDIUM_REGEX, score=0.5)
     super().__init__(supported_entity="US_SSN",
                      patterns=[very_weak, weak, medium],
                      context=CONTEXT)
예제 #5
0
 def test_no_entity_for_pattern_recognizer(self):
     """Building a ``MockRecognizer`` with an empty entity list must raise
     ``ValueError``."""
     with pytest.raises(ValueError):
         # Pattern construction stays inside the context manager, exactly
         # like the original, so the scope of the expectation is unchanged.
         first = Pattern(name="p1", regex="someregex", score=1.0)
         second = Pattern(name="p1", regex="someregex", score=0.5)
         MockRecognizer(entity=[],
                        patterns=[first, second],
                        black_list=[],
                        name=None,
                        context=None)
예제 #6
0
 def __init__(self):
     """Set up the recognizer for the ``PHONE_NUMBER`` entity.

     The strong, medium and weak regexes and their scores are read from
     the ``UsPhoneRecognizer`` class attributes; ``CONTEXT`` is passed to
     the parent initializer as the context.
     """
     owner = UsPhoneRecognizer
     pattern_specs = (
         ('Phone (strong)', owner.STRONG_REGEX, owner.STRONG_REGEX_SCORE),
         ('Phone (medium)', owner.MEDIUM_REGEX, owner.MEDIUM_REGEX_SCORE),
         ('Phone (weak)', owner.WEAK_REGEX, owner.WEAK_REGEX_SCORE),
     )
     phone_patterns = [Pattern(label, regex, score)
                       for label, regex, score in pattern_specs]
     super().__init__(supported_entity="PHONE_NUMBER",
                      patterns=phone_patterns,
                      context=CONTEXT)
예제 #7
0
    def get_all_recognizers(self):
        """
        Returns a list of PatternRecognizer objects which were created from
        the recognizers stored in the underlying store.

        Each stored recognizer is converted by copying its name, entity,
        language, blacklist, context phrases and patterns into a new
        PatternRecognizer.

        :return: a list of PatternRecognizer, or an empty list when the
                 remote call fails with ``grpc.RpcError``
        """
        req = recognizers_store_pb2.RecognizersGetAllRequest()

        try:
            raw_recognizers = self.rs_stub.ApplyGetAll(req).recognizers
        except grpc.RpcError:
            # Adjacent string literals are used instead of a backslash
            # continuation inside the literal, which used to embed the
            # indentation of the following source line in the log message.
            logging.info("Failed getting recognizers from the remote store. "
                         "Returning an empty list")
            return []

        return [
            PatternRecognizer(
                name=stored.name,
                supported_entity=stored.entity,
                supported_language=stored.language,
                black_list=stored.blacklist,
                context=stored.contextPhrases,
                patterns=[Pattern(pat.name, pat.regex, pat.score)
                          for pat in stored.patterns])
            for stored in raw_recognizers
        ]
예제 #8
0
    def from_dict(cls, entity_recognizer_dict):
        """
        Build an instance of ``cls`` from a dictionary of constructor
        keyword arguments.

        When the dictionary has a non-empty ``"patterns"`` entry, each item
        in it is converted with ``Pattern.from_dict`` before ``cls`` is
        called.

        :param entity_recognizer_dict: mapping of keyword arguments for
               ``cls``; it is not modified by this call
        :return: the result of ``cls(**kwargs)``
        """
        # Work on a shallow copy so the caller's dictionary keeps its
        # original (serialized) patterns instead of being overwritten.
        init_kwargs = dict(entity_recognizer_dict)

        serialized_patterns = init_kwargs.get("patterns")
        if serialized_patterns:
            init_kwargs['patterns'] = [Pattern.from_dict(pat)
                                       for pat in serialized_patterns]

        return cls(**init_kwargs)
예제 #9
0
    def test_added_pattern_recognizer_works(self):
        """A recognizer added to the store mock is picked up by the engine.

        The same text is analyzed before and after the "ROCKET" recognizer
        is added to the store mock: no result is expected before, and
        exactly one "ROCKET" result afterwards.
        """
        rocket_recognizer = PatternRecognizer(
            "ROCKET",
            name="Rocket recognizer",
            patterns=[Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)])

        store_mock = RecognizerStoreApiMock()
        engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(store_mock),
            nlp_engine=MockNlpEngine())

        # Shared arguments for both analysis runs.
        request = dict(text="rocket is my favorite transportation",
                       entities=["CREDIT_CARD", "ROCKET"],
                       language='en',
                       all_fields=False)

        # Nothing has been added to the store yet, so nothing is detected.
        results = engine.analyze(self.unit_test_guid, **request)
        assert len(results) == 0

        # Register the recognizer for the word "rocket".
        store_mock.add_custom_pattern_recognizer(rocket_recognizer)

        # The very same request must now yield the ROCKET entity.
        results = engine.analyze(self.unit_test_guid, **request)
        assert len(results) == 1
        assert_result(results[0], "ROCKET", 0, 7, 0.8)
예제 #10
0
    def test_remove_pattern_recognizer(self):
        """The registry reflects recognizers added to and removed from the
        store mock: the custom recognizer count goes 0 -> 1 -> 0."""
        spaceship_recognizer = PatternRecognizer(
            "SPACESHIP",
            name="Spaceship recognizer",
            patterns=[Pattern("spaceship pattern",
                              r'\W*(spaceship)\W*', 0.8)])

        store_mock = RecognizerStoreApiMock()
        registry = RecognizerRegistry(store_mock)

        def custom_recognizer_count():
            # Queries the registry afresh on every call.
            return len(registry.get_custom_recognizers())

        # The store starts out empty.
        assert custom_recognizer_count() == 0

        # Add a new recognizer for the word "spaceship".
        store_mock.add_custom_pattern_recognizer(spaceship_recognizer)
        assert custom_recognizer_count() == 1

        # Remove it again by name.
        store_mock.remove_recognizer("Spaceship recognizer")
        assert custom_recognizer_count() == 0
예제 #11
0
 def __init__(self):
     """Set up the recognizer for the ``IBAN_CODE`` entity.

     A single generic pattern built from ``IBAN_GENERIC_REGEX`` and
     ``IBAN_GENERIC_SCORE`` is registered; ``CONTEXT`` is passed to the
     parent initializer as the context.
     """
     generic = Pattern(name='IBAN Generic',
                       regex=IBAN_GENERIC_REGEX,
                       score=IBAN_GENERIC_SCORE)
     super().__init__(supported_entity="IBAN_CODE",
                      patterns=[generic],
                      context=CONTEXT)
예제 #12
0
    def test_from_dict(self):
        """``Pattern.from_dict`` rebuilds a pattern whose name, score and
        regex equal those of the ``my_pattern`` fixture."""
        rebuilt = Pattern.from_dict(my_pattern_dict)

        for attribute in ("name", "score", "regex"):
            assert getattr(my_pattern, attribute) == getattr(rebuilt,
                                                             attribute)
예제 #13
0
    def test_context_custom_recognizer(self):
        """A custom pattern recognizer scores higher when real NLP artifacts
        are supplied than when empty mock artifacts are supplied.

        Per the original author's note, the pattern deliberately includes
        preceding whitespace (``\\s+(rocket)``), so the regex match does not
        line up with the token (which is just 'rocket'); the context window
        is still expected to be located correctly.
        """
        nlp_engine = SpacyNlpEngine()
        empty_artifacts = NlpArtifacts([], [], [], [], None, "en")

        recognizer = PatternRecognizer(
            supported_entity="ROCKET",
            name="rocketrecognizer",
            context=["cool"],
            patterns=[Pattern("rocketpattern", "\\s+(rocket)", 0.3)])

        text = "hi, this is a cool ROCKET"
        entities = ["ROCKET"]
        real_artifacts = nlp_engine.process_text(text, "en")

        plain_results = recognizer.analyze(text, entities, empty_artifacts)
        boosted_results = recognizer.analyze(text, entities, real_artifacts)

        # Both runs must find the same number of matches ...
        assert (len(plain_results) == len(boosted_results))
        # ... and every match must score strictly higher with context.
        for plain, boosted in zip(plain_results, boosted_results):
            assert (plain.score < boosted.score)
예제 #14
0
    def __black_list_to_regex(black_list):
        """
        Converts a list of words to a matching regex, to be analyzed by the
        regex engine as a part of the analyze logic.

        Each word is matched literally and only when it is delimited by a
        space or by the start/end of the text.

        :param black_list: the list of words to detect
        :return: a Pattern named "black_list" with score 1.0 whose regex
                 matches any of the given words
        """
        # Local import keeps this block self-contained regardless of the
        # file-level imports.
        import re

        # Escape every word so regex metacharacters it may contain
        # (e.g. "." in "Mr." or "+" in "C++") are matched literally instead
        # of being interpreted, or breaking compilation of the pattern.
        # NOTE(review): an empty black_list still yields a pattern with an
        # empty alternative - presumably callers only invoke this with a
        # non-empty list; confirm.
        alternatives = '|'.join(re.escape(word) for word in black_list)
        regex = r"(?:^|(?<= ))(" + alternatives + r")(?:(?= )|$)"
        return Pattern(name="black_list", regex=regex, score=1.0)
예제 #15
0
    def test_removed_pattern_recognizer_doesnt_work(self):
        """A recognizer removed from the store mock stops producing results.

        The same text is analyzed three times: before the "SPACESHIP"
        recognizer is added (no results), after it is added (one result),
        and after it is removed again (no results).
        """
        spaceship_recognizer = PatternRecognizer(
            "SPACESHIP",
            name="Spaceship recognizer",
            patterns=[Pattern("spaceship pattern",
                              r'\W*(spaceship)\W*', 0.8)])

        store_mock = RecognizerStoreApiMock()
        engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(store_mock),
            nlp_engine=MockNlpEngine())

        # Shared arguments for all three analysis runs.
        request = dict(text="spaceship is my favorite transportation",
                       entities=["CREDIT_CARD", "SPACESHIP"],
                       language='en',
                       all_fields=False)

        def run_analysis():
            return engine.analyze(self.unit_test_guid, **request)

        # Nothing has been added to the store yet, so nothing is detected.
        results = run_analysis()
        assert len(results) == 0

        # Add a new recognizer for the word "spaceship" ...
        store_mock.add_custom_pattern_recognizer(spaceship_recognizer)
        # ... and check that the entity is now recognized.
        results = run_analysis()
        assert len(results) == 1
        assert_result(results[0], "SPACESHIP", 0, 10, 0.8)

        # Remove the recognizer and verify it no longer yields results.
        store_mock.remove_recognizer("Spaceship recognizer")
        results = run_analysis()
        assert len(results) == 0
예제 #16
0
    def add_custom_pattern_recognizer(self, new_recognizer,
                                      skip_hash_update=False):
        """
        Append a copy of ``new_recognizer`` to ``self.recognizers`` and,
        unless told otherwise, refresh ``self.latest_hash``.

        The copy is a new PatternRecognizer built from the given
        recognizer's name, first supported entity, language, black list,
        context and (re-created) patterns.

        :param new_recognizer: the recognizer to copy into the store
        :param skip_hash_update: when true, ``self.latest_hash`` is left
               untouched
        """
        copied_patterns = [Pattern(pat.name, pat.regex, pat.score)
                           for pat in new_recognizer.patterns]
        stored_recognizer = PatternRecognizer(
            name=new_recognizer.name,
            supported_entity=new_recognizer.supported_entities[0],
            supported_language=new_recognizer.supported_language,
            black_list=new_recognizer.black_list,
            context=new_recognizer.context,
            patterns=copied_patterns)
        self.recognizers.append(stored_recognizer)

        if not skip_hash_update:
            # The hash is the MD5 digest over the UTF-8 encoded names of
            # all stored recognizers, in storage order.
            digest = hashlib.md5()
            for recognizer in self.recognizers:
                digest.update(recognizer.name.encode('utf-8'))
            self.latest_hash = digest.digest()
예제 #17
0
    def test_add_pattern_recognizer(self):
        """A recognizer added to the store mock is returned by the registry
        with its recognizer name and pattern name intact."""
        rocket_recognizer = PatternRecognizer(
            "ROCKET",
            name="Rocket recognizer",
            patterns=[Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)])

        store_mock = RecognizerStoreApiMock()
        registry = RecognizerRegistry(store_mock)

        # The store starts out empty.
        found = registry.get_custom_recognizers()
        assert len(found) == 0

        # Add a new recognizer for the word "rocket".
        store_mock.add_custom_pattern_recognizer(rocket_recognizer)

        found = registry.get_custom_recognizers()
        assert len(found) == 1
        only_recognizer = found[0]
        assert only_recognizer.patterns[0].name == "rocket pattern"
        assert only_recognizer.name == "Rocket recognizer"
예제 #18
0
    def test_cache_logic(self):
        """Check how the registry reacts to the store mock's hash updates.

        Negative flow: a recognizer added with ``skip_hash_update=True`` is
        not returned by the registry and the storage access counter stays
        at 0. Positive flow: the same addition with
        ``skip_hash_update=False`` is returned and the counter becomes 1.
        """
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])

        # Negative flow
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)
        custom_recognizers = recognizer_registry.get_custom_recognizers()
        # Nothing should be returned
        assert len(custom_recognizers) == 0
        # Since no hash was returned, no access to the storage is expected
        assert recognizers_store_api_mock.times_accessed_storage == 0

        # Add a new recognizer without updating the store's hash
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer,
            skip_hash_update=True)

        # Since the hash wasn't updated, the registry still returns its
        # stale view, which does not include the newly added recognizer
        custom_recognizers = recognizer_registry.get_custom_recognizers()
        assert len(custom_recognizers) == 0
        # And the underlying storage was still not accessed
        assert recognizers_store_api_mock.times_accessed_storage == 0

        # Positive flow
        # Now do the same with a fresh mock and registry, only this time
        # update the hash so the new recognizer becomes visible
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)

        recognizer_registry.get_custom_recognizers()
        assert recognizers_store_api_mock.times_accessed_storage == 0
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer,
            skip_hash_update=False)
        custom_recognizers = recognizer_registry.get_custom_recognizers()
        assert len(custom_recognizers) == 1
        # The storage was accessed exactly once to fetch the new recognizer
        assert recognizers_store_api_mock.times_accessed_storage == 1
예제 #19
0
 def __init__(self):
     """Set up the recognizer for the ``US_PASSPORT`` entity.

     A single very weak regex pattern (score 0.2) is registered and
     ``CONTEXT`` is passed to the parent initializer as the context.
     """
     super().__init__(
         supported_entity="US_PASSPORT",
         patterns=[Pattern(name='Passport (very weak)',
                           regex=VERY_WEAK_REGEX,
                           score=0.2)],
         context=CONTEXT,
     )
예제 #20
0
 def get_mock_pattern_recognizer(self, lang, entity, name):
     """Create a PatternRecognizer with one placeholder pattern.

     :param lang: value passed as ``supported_language``
     :param entity: value passed as ``supported_entity``
     :param name: value passed as the recognizer ``name``
     :return: a PatternRecognizer holding a single pattern named "pat"
              with regex "REGEX" and score 1.0
     """
     placeholder = Pattern("pat", regex="REGEX", score=1.0)
     recognizer = PatternRecognizer(name=name,
                                    supported_language=lang,
                                    supported_entity=entity,
                                    patterns=[placeholder])
     return recognizer
예제 #21
0
 def __init__(self):
     """Set up the recognizer for the ``EMAIL_ADDRESS`` entity.

     A single regex pattern built from ``REGEX`` with score 0.5 is
     registered; ``CONTEXT`` is passed to the parent initializer.
     """
     super().__init__(
         supported_entity="EMAIL_ADDRESS",
         patterns=[Pattern(name='Email (Medium)', regex=REGEX, score=0.5)],
         context=CONTEXT,
     )
예제 #22
0
 def __init__(self):
     """Set up the recognizer for the ``CRYPTO`` entity.

     A single regex pattern built from ``REGEX`` with score 0.5 is
     registered; ``CONTEXT`` is passed to the parent initializer.
     """
     super().__init__(
         supported_entity="CRYPTO",
         patterns=[Pattern(name='Crypto (Medium)', regex=REGEX, score=0.5)],
         context=CONTEXT,
     )
예제 #23
0
 def __init__(self):
     """Set up the recognizer for the ``CREDIT_CARD`` entity.

     A single weak regex pattern built from ``REGEX`` with score 0.2 is
     registered; ``CONTEXT`` is passed to the parent initializer.
     """
     super().__init__(
         supported_entity="CREDIT_CARD",
         patterns=[Pattern(name='All Credit Cards (weak)',
                           regex=REGEX,
                           score=0.2)],
         context=CONTEXT,
     )
예제 #24
0
 def __init__(self):
     """Set up the recognizer for the ``DOMAIN_NAME`` entity.

     A single regex pattern built from ``REGEX`` with score 0.5 is
     registered; ``CONTEXT`` is passed to the parent initializer.
     """
     # NOTE(review): the pattern name has empty parentheses where sibling
     # recognizers put a strength label - kept verbatim; confirm intent.
     super().__init__(
         supported_entity="DOMAIN_NAME",
         patterns=[Pattern(name='Domain ()', regex=REGEX, score=0.5)],
         context=CONTEXT,
     )
예제 #25
0
 def __init__(self):
     """Set up the recognizer for the ``UK_NHS`` entity.

     A single regex pattern built from ``REGEX`` with score 0.5 is
     registered; ``CONTEXT`` is passed to the parent initializer.
     """
     super().__init__(
         supported_entity="UK_NHS",
         patterns=[Pattern(name='NHS (medium)', regex=REGEX, score=0.5)],
         context=CONTEXT,
     )
예제 #26
0
from unittest import TestCase

from analyzer import Pattern
# Shared fixtures: the dict form of a pattern and the Pattern built from the
# very same name / regex / score values.
my_pattern_dict = {"name": "my pattern", "regex": "[re]", "score": 0.9}
my_pattern = Pattern(**my_pattern_dict)


class TestPattern(TestCase):
    """Tests converting ``Pattern`` to and from its dict form, using the
    module-level ``my_pattern`` / ``my_pattern_dict`` fixtures."""

    def test_to_dict(self):
        """``to_dict`` on the fixture pattern equals the fixture dict."""
        assert my_pattern_dict == my_pattern.to_dict()

    def test_from_dict(self):
        """``from_dict`` on the fixture dict reproduces the fixture
        pattern's name, score and regex."""
        rebuilt = Pattern.from_dict(my_pattern_dict)

        for field_name in ("name", "score", "regex"):
            wanted = getattr(my_pattern, field_name)
            got = getattr(rebuilt, field_name)
            assert wanted == got
예제 #27
0
 def __init__(self):
     """Set up the recognizer for the ``US_BANK_NUMBER`` entity.

     A single weak regex pattern built from ``REGEX`` with score 0.05 is
     registered; ``CONTEXT`` is passed to the parent initializer.
     """
     super().__init__(
         supported_entity="US_BANK_NUMBER",
         patterns=[Pattern(name='Bank Account (weak)',
                           regex=REGEX,
                           score=0.05)],
         context=CONTEXT,
     )