def test_remove_pattern_recognizer(self):
        pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("SPACESHIP",
                                               name="Spaceship recognizer",
                                               patterns=[pattern])
        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)

        # Expects zero custom recognizers
        recognizers = recognizer_registry.get_custom_recognizers()
        assert len(recognizers) == 0

        # Add a new recognizer for the word "rocket" (case insensitive)
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)

        # Expects one custom recognizer
        recognizers = recognizer_registry.get_custom_recognizers()
        assert len(recognizers) == 1

        # Remove recognizer
        recognizers_store_api_mock.remove_recognizer("Spaceship recognizer")

        # Expects zero custom recognizers
        recognizers = recognizer_registry.get_custom_recognizers()
        assert len(recognizers) == 0
Exemple #2
0
    def test_context_custom_recognizer(self):
        nlp_engine = SpacyNlpEngine()
        mock_nlp_artifacts = NlpArtifacts([], [], [], [], None, "en")

        # This test checks that a custom recognizer is also enhanced by context.
        # However this test also verifies a specific case in which the pattern also
        # includes a preceeding space (' rocket'). This in turn cause for a misalignment
        # between the tokens and the regex match (the token will be just 'rocket').
        # This misalignment is handled in order to find the correct context window.
        rocket_recognizer = PatternRecognizer(
            supported_entity="ROCKET",
            name="rocketrecognizer",
            context=["cool"],
            patterns=[Pattern("rocketpattern", "\\s+(rocket)", 0.3)])
        text = "hi, this is a cool ROCKET"
        recognizer = rocket_recognizer
        entities = ["ROCKET"]
        nlp_artifacts = nlp_engine.process_text(text, "en")
        results_without_context = recognizer.analyze(text, entities,
                                                     mock_nlp_artifacts)
        results_with_context = recognizer.analyze(text, entities,
                                                  nlp_artifacts)
        assert (len(results_without_context) == len(results_with_context))
        for i in range(len(results_with_context)):
            assert (results_without_context[i].score <
                    results_with_context[i].score)
Exemple #3
0
    def test_added_pattern_recognizer_works(self):
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])

        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        text = "rocket is my favorite transportation"
        entities = ["CREDIT_CARD", "ROCKET"]

        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)

        assert len(results) == 0

        # Add a new recognizer for the word "rocket" (case insensitive)
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)

        # Check that the entity is recognized:
        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)

        assert len(results) == 1
        assert_result(results[0], "ROCKET", 0, 7, 0.8)
    def get_all_recognizers(self):
        """
        Returns a list of CustomRecognizer which were created from the
        recognizers stored in the underlying store
        """
        req = recognizers_store_pb2.RecognizersGetAllRequest()
        raw_recognizers = []

        try:
            raw_recognizers = self.rs_stub.ApplyGetAll(req).recognizers

        except grpc.RpcError:
            logging.info("Failed getting recognizers from the remote store. \
            Returning an empty list")
            return raw_recognizers

        custom_recognizers = []
        for new_recognizer in raw_recognizers:
            patterns = []
            for pat in new_recognizer.patterns:
                patterns.extend([Pattern(pat.name, pat.regex, pat.score)])
            new_custom_recognizer = PatternRecognizer(
                name=new_recognizer.name,
                supported_entity=new_recognizer.entity,
                supported_language=new_recognizer.language,
                black_list=new_recognizer.blacklist,
                context=new_recognizer.contextPhrases,
                patterns=patterns)
            custom_recognizers.append(new_custom_recognizer)

        return custom_recognizers
    def test_from_dict(self):
        json = {'supported_entity': 'ENTITY_1',
                'supported_language': 'en',
                'patterns': [{'name': 'p1', 'score': 0.5, 'regex': '([0-9]{1,9})'}],
                'context': ['w1', 'w2', 'w3'],
                'version': "1.0"}

        new_recognizer = PatternRecognizer.from_dict(json)
        ### consider refactoring assertions
        assert new_recognizer.supported_entities == ['ENTITY_1']
        assert new_recognizer.supported_language == 'en'
        assert new_recognizer.patterns[0].name == 'p1'
        assert new_recognizer.patterns[0].score == 0.5
        assert new_recognizer.patterns[0].regex == '([0-9]{1,9})'
        assert new_recognizer.context == ['w1', 'w2', 'w3']
        assert new_recognizer.version == "1.0"
Exemple #6
0
    def test_removed_pattern_recognizer_doesnt_work(self):
        pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("SPACESHIP",
                                               name="Spaceship recognizer",
                                               patterns=[pattern])

        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        text = "spaceship is my favorite transportation"
        entities = ["CREDIT_CARD", "SPACESHIP"]

        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)

        assert len(results) == 0

        # Add a new recognizer for the word "rocket" (case insensitive)
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)
        # Check that the entity is recognized:
        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)
        assert len(results) == 1
        assert_result(results[0], "SPACESHIP", 0, 10, 0.8)

        # Remove recognizer
        recognizers_store_api_mock.remove_recognizer("Spaceship recognizer")
        # Test again to see we didn't get any results
        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)

        assert len(results) == 0
Exemple #7
0
    def add_custom_pattern_recognizer(self, new_recognizer,
                                      skip_hash_update=False):
        patterns = []
        for pat in new_recognizer.patterns:
            patterns.extend([Pattern(pat.name, pat.regex, pat.score)])
        new_custom_recognizer = PatternRecognizer(name=new_recognizer.name, supported_entity=new_recognizer.supported_entities[0],
                                                  supported_language=new_recognizer.supported_language,
                                                  black_list=new_recognizer.black_list,
                                                  context=new_recognizer.context,
                                                  patterns=patterns)
        self.recognizers.append(new_custom_recognizer)

        if skip_hash_update:
            return

        m = hashlib.md5()
        for recognizer in self.recognizers:
            m.update(recognizer.name.encode('utf-8'))
        self.latest_hash = m.digest()
    def test_add_pattern_recognizer(self):
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])

        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)
        recognizers = recognizer_registry.get_custom_recognizers()
        assert len(recognizers) == 0

        # Add a new recognizer for the word "rocket" (case insensitive)
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)

        recognizers = recognizer_registry.get_custom_recognizers()
        assert len(recognizers) == 1
        assert recognizers[0].patterns[0].name == "rocket pattern"
        assert recognizers[0].name == "Rocket recognizer"
Exemple #9
0
    def test_cache_logic(self):
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])

        # Negative flow
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)
        custom_recognizers = recognizer_registry.get_custom_recognizers()
        # Nothing should be returned
        assert len(custom_recognizers) == 0
        # Since no hash was returned, then no access to storage is expected
        assert recognizers_store_api_mock.times_accessed_storage == 0

        # Add a new recognizer
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer,
            skip_hash_update=True)

        # Since the hash wasn't updated the recognizers are stale from the cache
        # without the newly added one
        custom_recognizers = recognizer_registry.get_custom_recognizers()
        assert len(custom_recognizers) == 0
        # And we also didn't accessed the underlying storage
        assert recognizers_store_api_mock.times_accessed_storage == 0

        # Positive flow
        # Now do the same only this time update the hash so it should work properly
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)

        recognizer_registry.get_custom_recognizers()
        assert recognizers_store_api_mock.times_accessed_storage == 0
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer,
            skip_hash_update=False)
        custom_recognizers = recognizer_registry.get_custom_recognizers()
        assert len(custom_recognizers) == 1
        # Accessed again
        assert recognizers_store_api_mock.times_accessed_storage == 1
    def test_from_dict_returns_instance(self):
        pattern1_dict = {'name': 'p1', 'score': 0.5, 'regex': '([0-9]{1,9})'}
        pattern2_dict = {'name': 'p2', 'score': 0.8, 'regex': '([0-9]{1,9})'}

        ent_rec_dict = {"supported_entity": "A",
                        "supported_language": "he",
                        "patterns": [pattern1_dict, pattern2_dict]
                        }
        pattern_recognizer = PatternRecognizer.from_dict(ent_rec_dict)

        assert pattern_recognizer.supported_entities == ["A"]
        assert pattern_recognizer.supported_language == "he"
        assert pattern_recognizer.version == "0.0.1"

        assert pattern_recognizer.patterns[0].name == "p1"
        assert pattern_recognizer.patterns[0].score == 0.5
        assert pattern_recognizer.patterns[0].regex == '([0-9]{1,9})'

        assert pattern_recognizer.patterns[1].name == "p2"
        assert pattern_recognizer.patterns[1].score == 0.8
        assert pattern_recognizer.patterns[1].regex == '([0-9]{1,9})'
 def get_mock_pattern_recognizer(self, lang, entity, name):
     return PatternRecognizer(
         supported_entity=entity,
         supported_language=lang,
         name=name,
         patterns=[Pattern("pat", regex="REGEX", score=1.0)])