def create_mock_pattern_recognizer(lang, entity, name):
    """Build a single-pattern PatternRecognizer stub for use in tests."""
    stub_pattern = Pattern("pat", regex="REGEX", score=1.0)
    return PatternRecognizer(
        supported_entity=entity,
        supported_language=lang,
        name=name,
        patterns=[stub_pattern],
    )
    def test_remove_pattern_recognizer(self):
        """Adding then removing a custom recognizer leaves the store empty."""
        spaceship_pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
        spaceship_recognizer = PatternRecognizer(
            "SPACESHIP",
            name="Spaceship recognizer",
            patterns=[spaceship_pattern])

        # Registry backed by an empty mock store; the analyzer must not see
        # this entity until it is explicitly added.
        store_mock = RecognizerStoreApiMock()
        registry = RecognizerRegistry(store_mock)

        # Expects zero custom recognizers initially
        assert len(registry.get_custom_recognizers()) == 0

        # Add a new recognizer for the word "rocket" (case insensitive)
        store_mock.add_custom_pattern_recognizer(spaceship_recognizer)

        # Expects exactly one custom recognizer now
        assert len(registry.get_custom_recognizers()) == 1

        # Remove it by name
        store_mock.remove_recognizer("Spaceship recognizer")

        # Back to zero custom recognizers
        assert len(registry.get_custom_recognizers()) == 0
Example #3
0
def zip_code_deny_list_recognizer():
    """Return a weak ZIP recognizer combining a regex pattern and a deny list."""
    weak_zip_pattern = Pattern(
        name="zip code (weak)",
        regex=r"(\b\d{5}(?:\-\d{4})?\b)",
        score=0.01,
    )
    return PatternRecognizer(
        supported_entity="ZIP",
        deny_list=["999"],
        patterns=[weak_zip_pattern],
    )
Example #4
0
    def test_added_pattern_recognizer_works(self):
        """A custom recognizer only produces results after it is added."""
        rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        rocket_recognizer = PatternRecognizer("ROCKET",
                                              name="Rocket recognizer",
                                              patterns=[rocket_pattern])

        # Engine backed by an empty mock store: the entity must not be
        # detected yet.
        store_mock = RecognizerStoreApiMock()
        engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(store_mock),
            nlp_engine=MockNlpEngine())
        text = "rocket is my favorite transportation"
        entities = ["CREDIT_CARD", "ROCKET"]

        def run_analysis():
            # Shared analyze call used before and after adding the recognizer
            return engine.analyze(self.unit_test_guid,
                                  text=text,
                                  entities=entities,
                                  language='en',
                                  all_fields=False)

        assert len(run_analysis()) == 0

        # Add a new recognizer for the word "rocket" (case insensitive)
        store_mock.add_custom_pattern_recognizer(rocket_recognizer)

        # Now the entity is recognized
        results = run_analysis()
        assert len(results) == 1
        assert_result(results[0], "ROCKET", 0, 7, 0.8)
    def get_all_recognizers(self):
        """
        Returns a list of CustomRecognizer which were created from the
        recognizers stored in the underlying store
        """
        request = recognizers_store_pb2.RecognizersGetAllRequest()

        try:
            raw_recognizers = self.rs_stub.ApplyGetAll(request).recognizers
        except grpc.RpcError:
            # Degrade gracefully when the remote store is unreachable.
            logger.info("Failed getting recognizers from the remote store. \
            Returning an empty list")
            return []

        custom_recognizers = []
        for raw in raw_recognizers:
            converted_patterns = [
                Pattern(pat.name, pat.regex, pat.score)
                for pat in raw.patterns
            ]
            custom_recognizers.append(PatternRecognizer(
                name=raw.name,
                supported_entity=raw.entity,
                supported_language=raw.language,
                black_list=raw.blacklist,
                context=raw.contextPhrases,
                patterns=converted_patterns))

        return custom_recognizers
Example #6
0
def test_when_context_custom_recognizer_then_succeed(nlp_engine,
                                                     mock_nlp_artifacts):
    """This test checks that a custom recognizer is also enhanced by context.

    However this test also verifies a specific case in which the pattern also
    includes a preceeding space (' rocket'). This in turn cause for a misalignment
    between the tokens and the regex match (the token will be just 'rocket').
    This misalignment is handled in order to find the correct context window.
    """
    rocket_pattern = Pattern("rocketpattern", r"\\s+(rocket)", 0.3)
    recognizer = PatternRecognizer(
        supported_entity="ROCKET",
        name="rocketrecognizer",
        context=["cool"],
        patterns=[rocket_pattern],
    )
    text = "hi, this is a cool ROCKET"
    entities = ["ROCKET"]
    nlp_artifacts = nlp_engine.process_text(text, "en")

    no_context_results = recognizer.analyze(text, entities, mock_nlp_artifacts)
    context_results = recognizer.analyze(text, entities, nlp_artifacts)

    assert len(no_context_results) == len(context_results)
    # Context presence must strictly raise every score
    for idx in range(len(context_results)):
        assert no_context_results[idx].score < context_results[idx].score
def test_when_ad_hoc_deny_list_recognizer_is_added_then_result_contains_result(
    loaded_analyzer_engine, ):
    """An ad-hoc deny-list recognizer contributes its entity to the results."""
    text = "Mr. John Smith's drivers license is AC432223"
    mr_recognizer = PatternRecognizer(supported_entity="MR",
                                      deny_list=["Mr.", "Mr"])

    responses = loaded_analyzer_engine.analyze(
        text=text, language="en", ad_hoc_recognizers=[mr_recognizer])

    assert "MR" in {response.entity_type for response in responses}
Example #8
0
    def __analyze_patterns(self, text):
        """
        Evaluates all patterns in the provided text, including words in
         the provided blacklist

        In a sentence we could get a false positive at the end of our regex, were we
        want to find the IBAN but not the false positive at the end of the match.

        i.e. "I want my deposit in DE89370400440532013000 2 days from today."

        :param text: text to analyze
        :return: A list of RecognizerResult
        """
        results = []
        for pattern in self.patterns:
            matches = re.finditer(pattern.regex, text, flags=self.flags)

            for match in matches:
                # Scan capture groups from the highest-numbered down to 1;
                # the first group that yields a non-empty, above-floor result
                # is kept and the remaining groups are skipped (`break` below).
                for grp_num in reversed(range(1, len(match.groups()) + 1)):
                    # Result span: starts at the full match, but ends at the
                    # capture group's end when that group participated in the
                    # match (span end > 0), so trailing false-positive text
                    # after the group is trimmed (see docstring example).
                    start = match.span(0)[0]
                    end = (
                        match.span(grp_num)[1]
                        if match.span(grp_num)[1] > 0
                        else match.span(0)[1]
                    )
                    current_match = text[start:end]

                    # Skip empty results
                    if current_match == "":
                        continue

                    score = pattern.score

                    # Validation hook: True pins the score to MAX_SCORE,
                    # False pins it to MIN_SCORE, None leaves it unchanged.
                    validation_result = self.validate_result(current_match)
                    description = PatternRecognizer.build_regex_explanation(
                        self.name, pattern.name, pattern.regex, score, validation_result
                    )
                    pattern_result = RecognizerResult(
                        self.supported_entities[0], start, end, score, description
                    )

                    if validation_result is not None:
                        if validation_result:
                            pattern_result.score = EntityRecognizer.MAX_SCORE
                        else:
                            pattern_result.score = EntityRecognizer.MIN_SCORE

                    # Keep only results above the score floor; one result per
                    # regex match is enough, so stop scanning further groups.
                    if pattern_result.score > EntityRecognizer.MIN_SCORE:
                        results.append(pattern_result)
                        break

        return results
Example #9
0
 def __init__(self, req_data: Dict):
     """Populate analyze-request attributes from a raw request-payload dict."""
     self.text = req_data.get("text")
     self.language = req_data.get("language")
     self.entities = req_data.get("entities")
     self.correlation_id = req_data.get("correlation_id")
     self.score_threshold = req_data.get("score_threshold")
     self.return_decision_process = req_data.get("return_decision_process")
     # Ad-hoc recognizers arrive serialized as dicts; deserialize eagerly.
     raw_ad_hoc = req_data.get("ad_hoc_recognizers")
     self.ad_hoc_recognizers = (
         [PatternRecognizer.from_dict(rec) for rec in raw_ad_hoc]
         if raw_ad_hoc else []
     )
def test_removed_pattern_recognizer_doesnt_work(unit_test_guid):
    """A recognizer stops producing results once removed from the store."""
    spaceship_pattern = Pattern("spaceship pattern", r"\W*(spaceship)\W*", 0.8)
    spaceship_recognizer = PatternRecognizer("SPACESHIP",
                                             name="Spaceship recognizer",
                                             patterns=[spaceship_pattern])

    # Engine backed by an empty mock store: the entity must not be detected.
    store_mock = RecognizerStoreApiMock()
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_mock),
        nlp_engine=NlpEngineMock(),
    )
    text = "spaceship is my favorite transportation"
    entities = ["CREDIT_CARD", "SPACESHIP"]

    def run_analysis():
        # Shared analyze call used at each stage of the test
        return engine.analyze(
            correlation_id=unit_test_guid,
            text=text,
            entities=entities,
            language="en",
            all_fields=False,
        )

    # Not added yet: nothing detected
    assert len(run_analysis()) == 0

    # Add a new recognizer for the word "spaceship" (case insensitive)
    store_mock.add_custom_pattern_recognizer(spaceship_recognizer)

    # Now the entity is recognized
    results = run_analysis()
    assert len(results) == 1
    assert_result(results[0], "SPACESHIP", 0, 10, 0.8)

    # Remove recognizer and verify no results are returned anymore
    store_mock.remove_recognizer("Spaceship recognizer")
    assert len(run_analysis()) == 0
Example #11
0
def test_pattern_recognizer(pii_csv, ext_csv, utterances, dictionary_path,
                            entity_name, pattern,
                            score, num_of_examples, acceptance_threshold,
                            max_mistakes_number):
    """Test a generic pattern recognizer on template-generated data.

    The dataset is generated from a template file, a CSV values file with
    common entities and another CSV values file with a custom entity.

    :param pii_csv: input csv file location with the common entities
    :param ext_csv: input csv file location with custom entities
    :param utterances: template file location
    :param dictionary_path: vocabulary/dictionary file location
    :param entity_name: custom entity name
    :param pattern: recognizer pattern
    :param score: score assigned to the test pattern
    :param num_of_examples: number of samples to be used from dataset to test
    :param acceptance_threshold: minimum precision/recall allowed to pass
    :param max_mistakes_number: maximum tolerated number of model errors
    """
    import os
    dir_path = os.path.dirname(os.path.realpath(__file__))
    pii_df = pd.read_csv(pii_csv.format(dir_path), encoding='utf-8')
    ext_df = pd.read_csv(ext_csv.format(dir_path), encoding='utf-8')
    dictionary_path = dictionary_path.format(dir_path)
    ext_column_name = ext_df.columns[0]

    # Extend the common-PII frame with custom-entity values, cycling through
    # the custom CSV as needed.
    pii_df[ext_column_name] = [
        ext_df.iat[i % ext_df.shape[0], 0] for i in range(0, pii_df.shape[0])
    ]

    # Generate examples from the template
    generator = FakeDataGenerator(fake_pii_csv_file=pii_df,
                                  utterances_file=utterances.format(dir_path),
                                  dictionary_path=dictionary_path)
    examples = generator.sample_examples(num_of_examples)

    test_pattern = Pattern("test pattern", pattern, score)
    recognizer = PatternRecognizer(entity_name,
                                   name="test recognizer",
                                   patterns=[test_pattern])

    scores = score_presidio_recognizer(recognizer, [entity_name], examples)
    if not np.isnan(scores.pii_f):
        assert acceptance_threshold <= scores.pii_f
    assert max_mistakes_number >= len(scores.model_errors)
    def test_from_dict(self):
        """from_dict restores entity, language, patterns, context and version."""
        serialized = {'supported_entity': 'ENTITY_1',
                      'supported_language': 'en',
                      'patterns': [{'name': 'p1', 'score': 0.5,
                                    'regex': '([0-9]{1,9})'}],
                      'context': ['w1', 'w2', 'w3'],
                      'version': "1.0"}

        restored = PatternRecognizer.from_dict(serialized)
        # consider refactoring assertions
        assert restored.supported_entities == ['ENTITY_1']
        assert restored.supported_language == 'en'
        first_pattern = restored.patterns[0]
        assert first_pattern.name == 'p1'
        assert first_pattern.score == 0.5
        assert first_pattern.regex == '([0-9]{1,9})'
        assert restored.context == ['w1', 'w2', 'w3']
        assert restored.version == "1.0"
def test_entities_filter_for_ad_hoc_removes_recognizer(loaded_analyzer_engine):
    """An explicit `entities` filter suppresses ad-hoc recognizer output."""
    text = "Mr. John Smith's zip code is 10002"
    mr_recognizer = PatternRecognizer(supported_entity="MR",
                                      deny_list=["Mr.", "Mr"])

    # Unfiltered run: the ad-hoc entity should appear
    unfiltered = loaded_analyzer_engine.analyze(
        text=text, language="en", ad_hoc_recognizers=[mr_recognizer])
    # Filtered to PERSON only: the ad-hoc entity should be excluded
    filtered = loaded_analyzer_engine.analyze(
        text=text,
        language="en",
        ad_hoc_recognizers=[mr_recognizer],
        entities=["PERSON"],
    )

    assert "MR" in [resp.entity_type for resp in unfiltered]
    assert "MR" not in [resp.entity_type for resp in filtered]
Example #14
0
def test_when_get_supported_fields_specific_language_then_return_single_result(
        loaded_registry, unit_test_guid, nlp_engine):
    """Only the Russian-language recognizer's entity is reported for 'ru'."""
    ru_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer RU",
        patterns=[Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)],
        supported_language="ru",
    )

    analyzer = AnalyzerEngine(registry=loaded_registry, nlp_engine=nlp_engine)
    analyzer.registry.add_recognizer(ru_recognizer)

    supported = analyzer.get_supported_entities(language="ru")
    assert len(supported) == 1
    assert "ROCKET" in supported
def test_when_add_pattern_recognizer_then_item_added():
    """Registry starts empty and holds exactly the recognizer we add."""
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)],
    )

    # Start from an empty recognizer registry
    registry = RecognizerRegistry(recognizers=[])
    assert len(registry.recognizers) == 0

    # Add a new recognizer for the word "rocket" (case insensitive)
    registry.add_recognizer(rocket_recognizer)

    assert len(registry.recognizers) == 1
    added = registry.recognizers[0]
    assert added.patterns[0].name == "rocket pattern"
    assert added.name == "Rocket recognizer"
Example #16
0
def test_when_get_recognizers_then_returns_supported_language():
    """get_recognizers(language=...) filters by supported language."""
    ru_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer RU",
        patterns=[Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)],
        supported_language="ru",
    )
    registry_mock = RecognizerRegistryMock()
    registry_mock.add_recognizer(ru_recognizer)
    engine = AnalyzerEngine(
        registry=registry_mock,
        nlp_engine=NlpEngineMock(),
    )

    # there is only 1 mocked russian recognizer
    assert len(engine.get_recognizers(language="ru")) == 1
Example #17
0
    def test_get_recognizers_returns_supported_language(self):
        """GetAllRecognizers filters recognizers by the requested language."""
        ru_recognizer = PatternRecognizer(
            "ROCKET",
            name="Rocket recognizer RU",
            patterns=[Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)],
            supported_language="ru")

        store_mock = RecognizerStoreApiMock()
        store_mock.add_custom_pattern_recognizer(ru_recognizer)
        engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(store_mock),
            nlp_engine=MockNlpEngine())

        response = engine.GetAllRecognizers(
            RecognizersAllRequest(language="ru"), None)
        # there is only 1 mocked russian recognizer
        assert len(response) == 1
    def test_add_pattern_recognizer(self):
        """A recognizer added to the store becomes visible via the registry."""
        rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        rocket_recognizer = PatternRecognizer("ROCKET",
                                              name="Rocket recognizer",
                                              patterns=[rocket_pattern])

        # Registry backed by an empty mock store: no custom recognizers yet
        store_mock = RecognizerStoreApiMock()
        registry = RecognizerRegistry(store_mock)
        assert len(registry.get_custom_recognizers()) == 0

        # Add a new recognizer for the word "rocket" (case insensitive)
        store_mock.add_custom_pattern_recognizer(rocket_recognizer)

        custom = registry.get_custom_recognizers()
        assert len(custom) == 1
        assert custom[0].patterns[0].name == "rocket pattern"
        assert custom[0].name == "Rocket recognizer"
def test_when_remove_pattern_recognizer_then_item_removed():
    """Removing a recognizer by name empties the registry again."""
    spaceship_recognizer = PatternRecognizer(
        "SPACESHIP",
        name="Spaceship recognizer",
        patterns=[Pattern("spaceship pattern", r"\W*(spaceship)\W*", 0.8)],
    )

    # Start from an empty recognizer registry
    registry = RecognizerRegistry(recognizers=[])
    assert len(registry.recognizers) == 0

    # Add, then confirm exactly one recognizer is present
    registry.add_recognizer(spaceship_recognizer)
    assert len(registry.recognizers) == 1

    # Remove by name and confirm the registry is empty again
    registry.remove_recognizer("Spaceship recognizer")
    assert len(registry.recognizers) == 0
Example #20
0
def test_when_add_recognizer_then_also_outputs_others(nlp_engine):
    """A custom recognizer coexists with the predefined recognizers."""
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)],
        supported_language="en",
    )
    registry = RecognizerRegistry()
    registry.add_recognizer(rocket_recognizer)
    registry.load_predefined_recognizers()

    # Predefined recognizers were loaded alongside the custom one
    assert len(registry.recognizers) > 1

    analyzer = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine)

    # Two detections expected: presumably the person name (predefined) and
    # the custom "rocket" entity
    results = analyzer.analyze(text="Michael Jones has a rocket", language="en")
    assert len(results) == 2
Example #21
0
    def test_get_recognizers_returns_added_custom(self):
        """Recognizer count grows from 15 predefined to 16 after adding one."""
        rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        rocket_recognizer = PatternRecognizer("ROCKET",
                                              name="Rocket recognizer",
                                              patterns=[rocket_pattern])

        store_mock = RecognizerStoreApiMock()
        engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(store_mock),
            nlp_engine=MockNlpEngine())
        request = RecognizersAllRequest(language="en")

        # there are 15 predefined recognizers
        assert len(engine.GetAllRecognizers(request, None)) == 15

        store_mock.add_custom_pattern_recognizer(rocket_recognizer)

        # there are 15 predefined recognizers and one custom
        assert len(engine.GetAllRecognizers(request, None)) == 16
    def test_from_dict_returns_instance(self):
        """from_dict builds a recognizer with defaults plus the given patterns."""
        expected_patterns = [
            {'name': 'p1', 'score': 0.5, 'regex': '([0-9]{1,9})'},
            {'name': 'p2', 'score': 0.8, 'regex': '([0-9]{1,9})'},
        ]
        recognizer = PatternRecognizer.from_dict({
            "supported_entity": "A",
            "supported_language": "he",
            "patterns": expected_patterns,
        })

        assert recognizer.supported_entities == ["A"]
        assert recognizer.supported_language == "he"
        # Default version is applied when none is supplied
        assert recognizer.version == "0.0.1"

        for actual, expected in zip(recognizer.patterns, expected_patterns):
            assert actual.name == expected['name']
            assert actual.score == expected['score']
            assert actual.regex == expected['regex']
Example #23
0
    def add_custom_pattern_recognizer(self,
                                      new_recognizer,
                                      skip_hash_update=False):
        """Store a copy of the recognizer and refresh the recognizers hash.

        When skip_hash_update is True the hash is deliberately left stale,
        which lets tests exercise the cache-invalidation logic.
        """
        cloned_patterns = [
            Pattern(pat.name, pat.regex, pat.score)
            for pat in new_recognizer.patterns
        ]
        self.recognizers.append(PatternRecognizer(
            name=new_recognizer.name,
            supported_entity=new_recognizer.supported_entities[0],
            supported_language=new_recognizer.supported_language,
            black_list=new_recognizer.black_list,
            context=new_recognizer.context,
            patterns=cloned_patterns))

        if skip_hash_update:
            return

        # MD5 over all stored recognizer names marks the store's version
        digest = hashlib.md5()
        for recognizer in self.recognizers:
            digest.update(recognizer.name.encode('utf-8'))
        self.latest_hash = digest.digest()
Example #24
0
def test_from_dict_returns_instance():
    """from_dict builds a recognizer with default version and given patterns."""
    expected_patterns = [
        {"name": "p1", "score": 0.5, "regex": "([0-9]{1,9})"},
        {"name": "p2", "score": 0.8, "regex": "([0-9]{1,9})"},
    ]
    recognizer = PatternRecognizer.from_dict({
        "supported_entity": "A",
        "supported_language": "he",
        "patterns": expected_patterns,
    })

    assert recognizer.supported_entities == ["A"]
    assert recognizer.supported_language == "he"
    # Default version is applied when none is supplied
    assert recognizer.version == "0.0.1"

    for actual, expected in zip(recognizer.patterns, expected_patterns):
        assert actual.name == expected["name"]
        assert actual.score == expected["score"]
        assert actual.regex == expected["regex"]
Example #25
0
def test_from_dict():
    """from_dict restores entity, language, patterns, context and version."""
    serialized = {
        "supported_entity": "ENTITY_1",
        "supported_language": "en",
        "patterns": [{"name": "p1", "score": 0.5, "regex": "([0-9]{1,9})"}],
        "context": ["w1", "w2", "w3"],
        "version": "1.0",
    }

    restored = PatternRecognizer.from_dict(serialized)
    # consider refactoring assertions
    assert restored.supported_entities == ["ENTITY_1"]
    assert restored.supported_language == "en"
    first_pattern = restored.patterns[0]
    assert first_pattern.name == "p1"
    assert first_pattern.score == 0.5
    assert first_pattern.regex == "([0-9]{1,9})"
    assert restored.context == ["w1", "w2", "w3"]
    assert restored.version == "1.0"
    def test_cache_logic(self):
        """Verify registry caching: a stale store hash keeps serving cached
        (empty) results without touching storage, while an updated hash
        triggers exactly one storage access and returns the new recognizer.
        """
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])

        # Negative flow
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)
        custom_recognizers = recognizer_registry.get_custom_recognizers()
        # Nothing should be returned
        assert len(custom_recognizers) == 0
        # Since no hash was returned, then no access to storage is expected
        assert recognizers_store_api_mock.times_accessed_storage == 0

        # Add a new recognizer, deliberately without refreshing the store hash
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer, skip_hash_update=True)

        # Since the hash wasn't updated the recognizers are stale from the cache
        # without the newly added one
        custom_recognizers = recognizer_registry.get_custom_recognizers()
        assert len(custom_recognizers) == 0
        # And we also didn't accessed the underlying storage
        assert recognizers_store_api_mock.times_accessed_storage == 0

        # Positive flow
        # Now do the same only this time update the hash so it should work properly
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)

        recognizer_registry.get_custom_recognizers()
        assert recognizers_store_api_mock.times_accessed_storage == 0
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer, skip_hash_update=False)
        custom_recognizers = recognizer_registry.get_custom_recognizers()
        assert len(custom_recognizers) == 1
        # Accessed again
        assert recognizers_store_api_mock.times_accessed_storage == 1
Example #27
0
def test_when_analyze_added_pattern_recognizer_then_succeed(unit_test_guid):
    """A recognizer only yields results after being added to the registry."""
    rocket_pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    rocket_recognizer = PatternRecognizer("ROCKET",
                                          name="Rocket recognizer",
                                          patterns=[rocket_pattern])

    registry_mock = RecognizerRegistryMock()

    # Engine with an empty mock registry: the entity must not be detected yet
    engine = AnalyzerEngine(
        registry=registry_mock,
        nlp_engine=NlpEngineMock(),
    )
    text = "rocket is my favorite transportation"
    entities = ["CREDIT_CARD", "ROCKET"]

    def run_analysis():
        # Shared analyze call used before and after adding the recognizer
        return engine.analyze(
            correlation_id=unit_test_guid,
            text=text,
            entities=entities,
            language="en",
        )

    assert len(run_analysis()) == 0

    # Add a new recognizer for the word "rocket" (case insensitive)
    registry_mock.add_recognizer(rocket_recognizer)

    # Now the entity is recognized
    results = run_analysis()
    assert len(results) == 1
    assert_result(results[0], "ROCKET", 0, 7, 0.8)
def test_get_recognizers_returns_custom():
    """GetAllRecognizers reports the custom recognizer beside the built-ins."""
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)])

    store_mock = RecognizerStoreApiMock()
    store_mock.add_custom_pattern_recognizer(rocket_recognizer)
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_mock),
        nlp_engine=NlpEngineMock(),
    )
    response = engine.GetAllRecognizers(
        RecognizersAllRequest(language="en"), None)
    # there are 15 predefined recognizers and one custom
    assert len(response) == 16

    # Exactly one entry must match the custom recognizer's name/entity/language
    matches = [
        rec for rec in response
        if rec.name == "Rocket recognizer"
        and rec.entities == ["ROCKET"] and rec.language == "en"
    ]
    assert len(matches) == 1
    def test_context_custom_recognizer(self):
        """This test checks that a custom recognizer is also enhanced by context.

        It also covers the case where the pattern includes a preceding space
        (' rocket'), which misaligns the regex match against the tokens (the
        token is just 'rocket'); that misalignment must still resolve to the
        correct context window.
        """
        nlp_engine = TESTS_NLP_ENGINE
        mock_nlp_artifacts = NlpArtifacts([], [], [], [], None, "en")

        recognizer = PatternRecognizer(
            supported_entity="ROCKET",
            name="rocketrecognizer",
            context=["cool"],
            patterns=[Pattern("rocketpattern", "\\s+(rocket)", 0.3)])
        text = "hi, this is a cool ROCKET"
        entities = ["ROCKET"]
        nlp_artifacts = nlp_engine.process_text(text, "en")

        without_context = recognizer.analyze(text, entities, mock_nlp_artifacts)
        with_context = recognizer.analyze(text, entities, nlp_artifacts)

        assert len(without_context) == len(with_context)
        # Context presence must strictly raise every score
        for plain, boosted in zip(without_context, with_context):
            assert plain.score < boosted.score
Example #30
0
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern

analyzer = AnalyzerEngine()

text1 = "Professor Plum, in the Dining Room, with the candlestick"

# Deny-list based recognizer: any exact title occurrence is tagged as TITLE.
titles_list = [
    "Sir", "Ma'am", "Madam", "Mr.", "Mrs.", "Ms.", "Miss", "Dr.", "Professor"
]
titles_recognizer = PatternRecognizer(supported_entity="TITLE",
                                      deny_list=titles_list)
analyzer.registry.add_recognizer(titles_recognizer)

result = analyzer.analyze(text=text1, language='en')
print(f"\nDeny List result:\n {result}")

text2 = "I live in 510 Broad st."

# Regex-based recognizer. Fix: the pattern is now a raw string — "\d" inside
# a plain string literal is an invalid escape sequence (SyntaxWarning on
# modern CPython, slated to become an error).
numbers_pattern = Pattern(name="numbers_pattern", regex=r"\d+", score=0.5)
number_recognizer = PatternRecognizer(supported_entity="NUMBER",
                                      patterns=[numbers_pattern])

result = number_recognizer.analyze(text=text2, entities=["NUMBER"])
print(f"\nRegex result:\n {result}")