Example #1
    def pseudonymize(
        self,
        original_text: str,
        presidio_response: List[RecognizerResult],
        count: int,
    ):
        """

        :param original_text: str containing the original text
        :param presidio_response: list of results from Presidio, to be used to know where entities are
        :param count: number of perturbations to return
        :return: List[str] with fake perturbations of original text
        """

        presidio_response = sorted(presidio_response,
                                   key=lambda resp: resp.start)

        anonymizer_engine = AnonymizerEngine()
        anonymized_result = anonymizer_engine.anonymize(
            text=original_text, analyzer_results=presidio_response)

        templated_text = anonymized_result.text
        templated_text = templated_text.replace(">", "}}").replace("<", "{{")
        fake_texts = [
            self.parse(templated_text, add_spans=False) for _ in range(count)
        ]
        return fake_texts
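A minimal usage sketch for the method above, assuming the surrounding class is instantiated as pseudonymizer and that Presidio's AnalyzerEngine supplies the RecognizerResult list; the variable names are illustrative, not from the original project.

# Hypothetical call of pseudonymize(); `pseudonymizer` stands in for an
# instance of the class this method belongs to.
from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
text = "My name is John Smith"
results = analyzer.analyze(text=text, language="en")

# Three fake variants of the sentence, with the detected entities swapped out.
fake_texts = pseudonymizer.pseudonymize(
    original_text=text,
    presidio_response=results,
    count=3,
)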
Example #2
def test_given_several_results_then_we_filter_them_and_get_correct_mocked_result(
):
    analyzer_results = [
        RecognizerResult(start=48, end=57, score=0.55, entity_type="SSN"),
        RecognizerResult(start=24, end=32, score=0.6, entity_type="FULL_NAME"),
        RecognizerResult(start=24, end=28, score=0.9,
                         entity_type="FIRST_NAME"),
        RecognizerResult(start=29, end=32, score=0.6, entity_type="LAST_NAME"),
        RecognizerResult(start=24, end=30, score=0.8, entity_type="NAME"),
        RecognizerResult(start=18, end=32, score=0.8, entity_type="BLA"),
        RecognizerResult(start=23, end=35, score=0.8, entity_type="BLA"),
        RecognizerResult(start=28, end=36, score=0.8, entity_type="BLA"),
        RecognizerResult(start=48,
                         end=57,
                         score=0.95,
                         entity_type="PHONE_NUMBER")
    ]

    operator_config = OperatorConfig("replace", {})
    operator_config.operator_name = ""
    engine = AnonymizerEngine()
    engine._operate = _operate
    result = engine.anonymize(
        "hello world, my name is Jane Doe. My number is: 034453334",
        analyzer_results, {"DEFAULT": operator_config})

    assert result.text == "Number: I am your new text!"
    assert len(result.items) == 1
    assert result.items[0].operator == "hash"
    assert result.items[0].entity_type == "type"
    assert result.items[0].start == 0
    assert result.items[0].end == 35
    assert result.items[0].text == "text"
Example #3
    def perturb(
        self,
        original_text: str,
        presidio_response: List[RecognizerResult],
        count: int,
        genders: List[str] = None,
        namesets: List[str] = None,
    ):
        """

        :param original_text: str containing the original text
        :param presidio_response: list of results from Presidio, to be used to know where entities are
        :param count: number of perturbations to return
        :param genders: gender valuse to use (options: 'female', 'male')
        :param namesets: name set values to use (options are values from the FakeNameGenerator NameSet column)
        :return: List[str] with fake perturbations of original text
        """

        presidio_response = sorted(presidio_response,
                                   key=lambda resp: resp.start)

        anonymizer_engine = AnonymizerEngine()
        anonymized_result = anonymizer_engine.anonymize(
            text=original_text, analyzer_results=presidio_response)

        text = anonymized_result.text
        text = text.replace(">", "}").replace("<", "{")

        self.templates = [text]
        return [
            sample.full_text for sample in self.sample_examples(
                count=count, genders=genders, namesets=namesets)
        ]
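The perturb method above follows the same calling pattern as the earlier pseudonymize sketch; the extra genders and namesets arguments narrow which fake values are sampled. Another sketch under the same assumptions (perturber is an illustrative instance name, and "American" is only assumed to be a valid FakeNameGenerator NameSet value):

from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
text = "My name is John Smith"
results = analyzer.analyze(text=text, language="en")

fake_texts = perturber.perturb(
    original_text=text,
    presidio_response=results,
    count=5,
    genders=["female"],
    namesets=["American"],  # assumed NameSet value from FakeNameGenerator
)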
Example #4
def test_given_anonymize_called_with_multiple_scenarios_then_expected_results_returned(
    anonymize_scenario, ):
    anonymizer_request_dict = json.loads(
        get_scenario_file_content("anonymize",
                                  f"{anonymize_scenario}.in.json"))
    expected_anonymize_result_json = json.loads(
        get_scenario_file_content("anonymize",
                                  f"{anonymize_scenario}.out.json"))
    items = []
    for item in expected_anonymize_result_json["items"]:
        items.append(
            AnonymizedEntity(
                item["anonymizer"],
                item["entity_type"],
                item["start"],
                item["end"],
                item["anonymized_text"],
            ))
    expected_anonymize_result = AnonymizerResult(
        expected_anonymize_result_json["text"], items)
    engine = AnonymizerEngine()
    anonymizers_config = AnonymizerRequest.get_anonymizer_configs_from_json(
        anonymizer_request_dict)
    analyzer_results = AnonymizerRequest.handle_analyzer_results_json(
        anonymizer_request_dict)

    try:
        actual_anonymize_result = engine.anonymize(
            anonymizer_request_dict.get("text"), analyzer_results,
            anonymizers_config)
    except Exception as e:
        actual_anonymize_result = str(e)

    assert actual_anonymize_result == expected_anonymize_result
Example #5
def test_given_text_with_pii_using_package_then_analyze_and_anonymize_complete_successfully():
    text_to_test = "John Smith drivers license is AC432223"

    expected_response = [RecognizerResult("PERSON", 0, 10, 0.85),
                         RecognizerResult("US_DRIVER_LICENSE", 30, 38, 0.6499999999999999)
                         ]
    # Create configuration containing engine name and models
    configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }

    # Create NLP engine based on configuration
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()

    # Pass the created NLP engine and supported_languages to the AnalyzerEngine
    analyzer = AnalyzerEngine(
        nlp_engine=nlp_engine,
        supported_languages=["en"]
    )
    analyzer_results = analyzer.analyze(text_to_test, "en")
    for i in range(len(analyzer_results)):
        assert analyzer_results[i] == expected_response[i]

    expected_response = AnonymizerResult(text="<PERSON> drivers license is <US_DRIVER_LICENSE>")
    expected_response.add_item(AnonymizedEntity("replace", "US_DRIVER_LICENSE", 28, 47, "<US_DRIVER_LICENSE>"))
    expected_response.add_item(AnonymizedEntity("replace", "PERSON", 0, 8, "<PERSON>"))

    anonymizer = AnonymizerEngine()
    anonymizer_results = anonymizer.anonymize(text_to_test, analyzer_results)
    assert anonymizer_results == expected_response
Example #6
def test_given_empty_text_to_engine_then_we_fail():
    engine = AnonymizerEngine()
    analyzer_result = RecognizerResult("SSN", 0, 1, 0.5)
    with pytest.raises(
        InvalidParamException, match="Invalid input, text can not be empty"
    ):
        engine.anonymize("", [analyzer_result], {})
Example #7
class Server:
    """Flask server for anonymizer."""
    def __init__(self):
        fileConfig(Path(Path(__file__).parent, LOGGING_CONF_FILE))
        self.logger = logging.getLogger("presidio-anonymizer")
        self.logger.setLevel(os.environ.get("LOG_LEVEL", self.logger.level))
        self.app = Flask(__name__)
        self.logger.info("Starting anonymizer engine")
        self.engine = AnonymizerEngine()
        self.decryptor = AnonymizerDecryptor()
        self.logger.info(WELCOME_MESSAGE)

        @self.app.route("/health")
        def health() -> str:
            """Return basic health probe result."""
            return "Presidio Anonymizer service is up"

        @self.app.route("/anonymize", methods=["POST"])
        def anonymize():
            content = request.get_json()
            if not content:
                return ErrorResponse("Invalid request json").to_json(), 400

            anonymizers_config = AnonymizerRequest.get_anonymizer_configs_from_json(
                content)
            analyzer_results = AnonymizerRequest.handle_analyzer_results_json(
                content)
            anonymizer_result = self.engine.anonymize(
                text=content.get("text"),
                analyzer_results=analyzer_results,
                anonymizers_config=anonymizers_config,
            )
            return anonymizer_result.to_json()

        @self.app.route("/decrypt", methods=["POST"])
        def decrypt() -> Union[str, Tuple[str, int]]:
            content = request.get_json()
            if not content:
                return ErrorResponse("Invalid request json").to_json(), 400
            decrypted_text = self.decryptor.decrypt(key=content.get("key"),
                                                    text=content.get("text"))
            return jsonify(result=decrypted_text)

        @self.app.route("/anonymizers", methods=["GET"])
        def anonymizers() -> Tuple[str, int]:
            """Return a list of supported anonymizers."""
            return json.dumps(self.engine.get_anonymizers()), 200

        @self.app.errorhandler(InvalidParamException)
        def invalid_param(err):
            self.logger.warning(
                f"failed to anonymize text with validation error: {err.err_msg}"
            )
            return ErrorResponse(err.err_msg).to_json(), 422

        @self.app.errorhandler(Exception)
        def server_error(e):
            self.logger.error(f"A fatal error occurred during execution: {e}")
            return ErrorResponse("Internal server error").to_json(), 500
Example #8
    def __init__(self):
        SpacyRecognizer.ENTITIES = ["PERSON"]
        Replace.NEW_VALUE = 'replace_text'
        nlp_engine = SpacyNlpEngine()
        nlp_engine.nlp['en'] = spacy.load('en_core_web_lg', disable=["parser", "tagger", "lemmatizer"])

        self.analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)
        self.anonymizer_engine = AnonymizerEngine()
Example #9
def test_given_analyzer_result_with_an_incorrect_text_positions_then_we_fail(
        original_text, start, end):
    engine = AnonymizerEngine()
    analyzer_result = RecognizerResult("type", start, end, 0.5)
    err_msg = (f"Invalid analyzer result, start: {start} and end: "
               f"{end}, while text length is only 11.")
    with pytest.raises(InvalidParamException, match=err_msg):
        engine.anonymize(original_text, [analyzer_result], {})
Example #10
def test_given_default_anonymizer_then_we_use_it():
    engine = AnonymizerEngine()
    text = "please REPLACE ME."
    analyzer_result = AnalyzerResult("SSN", 7, 17, 0.8)
    anonymizer_config = AnonymizerConfig("replace",
                                         {"new_value": "and thank you"})
    result = engine.anonymize(text, [analyzer_result],
                              {"DEFAULT": anonymizer_config})
    assert result == "please and thank you."
Example #11
def test_given_specific_anonymizer_then_we_use_it():
    engine = AnonymizerEngine()
    text = "please REPLACE ME."
    analyzer_result = RecognizerResult("SSN", 7, 17, 0.8)
    anonymizer_config = AnonymizerConfig("replace", {"new_value": "and thank you"})
    ssn_anonymizer_config = AnonymizerConfig("redact", {})
    result = engine.anonymize(
        text,
        [analyzer_result],
        {"DEFAULT": anonymizer_config, "SSN": ssn_anonymizer_config},
    ).text
    assert result == "please ."
Example #12
def test_given_operator_decrypt_then_we_fail():
    text = "hello world, my name is Jane Doe. My number is: 03-4453334"
    anonymizers_config = {"DEFAULT": OperatorConfig("decrypt", {"key": "key"})}
    analyzer_results = [
        RecognizerResult(start=24, end=32, score=0.8, entity_type="NAME"),
    ]
    engine = AnonymizerEngine()
    with pytest.raises(
            InvalidParamException,
            match="Invalid operator class 'decrypt'.",
    ):
        engine.anonymize(text, analyzer_results, anonymizers_config)
Example #13
def run_engine_and_validate(text: str, anonymizers_config, analyzer_results,
                            expected_result):
    engine = AnonymizerEngine()
    try:
        actual_anonymize_result = engine.anonymize(text, analyzer_results,
                                                   anonymizers_config)
    except Exception as e:
        actual_anonymize_result = str(e)
    print("********")
    print(actual_anonymize_result.to_json())
    print("********")
    assert actual_anonymize_result.to_json() == expected_result
Example #14
def anonymize_text(text: str) -> str:
    analyzer = AnalyzerEngine()
    anonymizer = AnonymizerEngine()
    analyzer_results = analyzer.analyze(text=text, language="en")
    anonymized_results = anonymizer.anonymize(
        text=text,
        analyzer_results=analyzer_results,
        anonymizers_config={
            "DEFAULT": AnonymizerConfig("replace",
                                        {"new_value": "<ANONYMIZED>"})
        },
    )
    return anonymized_results
Example #15
    def __init__(self, **data: Any):
        super().__init__(**data)

        if not self.engine_config:
            self.engine_config = PresidioEngineConfig()

        if not self.engine_config.models or len(
                self.engine_config.models) == 0:
            self.engine_config.models = [PresidioModelConfig()]

        # If spacy engine then load Spacy models and select languages
        languages = []
        for model_config in self.engine_config.models:
            languages.append(model_config.lang_code)

            # Check SpacyNlpEngine.engine_name
            if (self.engine_config.nlp_engine_name == "spacy"
                    and model_config.model_name is not None):
                try:
                    spacy_model = __import__(model_config.model_name)
                    spacy_model.load()
                    logger.info(
                        f"Spacy model {model_config.model_name} is already downloaded"
                    )
                except Exception:
                    logger.warning(
                        f"Spacy model {model_config.model_name} is not downloaded"
                    )
                    logger.warning(
                        f"Downloading spacy model {model_config.model_name}, it might take some time"
                    )
                    from spacy.cli import download

                    download(model_config.model_name)

        # Create NLP engine based on configuration
        provider = NlpEngineProvider(
            nlp_configuration=self.engine_config.dict())
        nlp_engine = provider.create_engine()

        # Pass the created NLP engine and supported_languages to the AnalyzerEngine
        self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                                        supported_languages=languages)

        # self._analyzer.registry.load_predefined_recognizers()
        if self.entity_recognizers:
            for entity_recognizer in self.entity_recognizers:
                self._analyzer.registry.add_recognizer(entity_recognizer)

        # Initialize the anonymizer with logger
        self._anonymizer = AnonymizerEngine()
Example #16
def obfuscate(text):
    analyzer_results = analyze(text)
    anonymizer = AnonymizerEngine()
    anonymized_results = anonymizer.anonymize(
        text=text,
        analyzer_results=analyzer_results,
        anonymizers_config={"DEFAULT": AnonymizerConfig("replace", {"new_value": "<ANONYMIZED>"}),
                            "PHONE_NUMBER": AnonymizerConfig("mask",
                                                             {"type": "mask", "masking_char": "*", "chars_to_mask": 12,
                                                              "from_end": True}),
                            }
    )
    output = anonymized_results
    return output
Example #17
class HansardTextFormatter:
    def __init__(self):
        SpacyRecognizer.ENTITIES = ["PERSON"]
        Replace.NEW_VALUE = 'replace_text'
        nlp_engine = SpacyNlpEngine()
        nlp_engine.nlp['en'] = spacy.load(
            'en_core_web_lg', disable=["parser", "tagger", "lemmatizer"])

        self.analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)
        self.anonymizer_engine = AnonymizerEngine()

    def run_anonymizer(self, text):
        results = self.analyzer_engine.analyze(text=text,
                                               entities=[],
                                               language='en',
                                               score_threshold=0.5)
        if results:
            config = {
                "PERSON":
                AnonymizerConfig("replace", {"replace_text": "[GDPRREDACT]"})
            }
            return self.anonymizer_engine.anonymize(text, results, config)

    @staticmethod
    def clean_text(text):
        text = text.replace('\n', '')
        text = text.replace('<BR />', '\n')
        return text

    def run_formatter(self, text):
        anon_text = self.run_anonymizer(text)
        # run_anonymizer returns None when no entities are detected, so fall back
        # to the original text before cleaning.
        if anon_text is None:
            anon_text = text
        cleaned_text = self.clean_text(anon_text)
        return cleaned_text
Example #18
def anonymize_text(text: str) -> str:
    try:
        analyzer = AnalyzerEngine()
        anonymizer = AnonymizerEngine()
        analyzer_results = analyzer.analyze(text=text, language="en")
        anonymized_results = anonymizer.anonymize(
            text=text,
            analyzer_results=analyzer_results,
            operators={
                "DEFAULT":
                AnonymizerConfig("replace", {"new_value": "<ANONYMIZED>"})
            },
        )
        return anonymized_results.text
    except Exception as e:
        print(f"An exception occurred. {e}")
Example #19
def test_given_anonymize_with_encrypt_then_text_returned_with_encrypted_content():
    unencrypted_text = "My name is "
    expected_encrypted_text = "Chloë"
    text = unencrypted_text + expected_encrypted_text
    start_index = 11
    end_index = 16
    key = "WmZq4t7w!z%C&F)J"
    analyzer_results = [RecognizerResult("PERSON", start_index, end_index, 0.8)]
    anonymizers_config = {"PERSON": OperatorConfig("encrypt", {"key": key})}

    actual_anonymize_result = (
        AnonymizerEngine().anonymize(text, analyzer_results, anonymizers_config)
    )

    assert len(actual_anonymize_result.items) == 1
    anonymized_entities = [
        AnonymizerResult.from_operator_result(actual_anonymize_result.items[0])
    ]
    engine = DeanonymizeEngine()
    decryption = engine.deanonymize(
        actual_anonymize_result.text, anonymized_entities,
        {"PERSON": OperatorConfig(Decrypt.NAME, {"key": key})}
    )
    assert decryption.text == "My name is Chloë"
    assert len(decryption.items) == 1
    assert decryption.items[0].text == "Chloë"
    assert decryption.items[0].end == 16
    assert decryption.items[0].start == 11
    assert decryption.items[0].entity_type == "PERSON"
Example #20
class Presidio:
    def __init__(self):
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()

    def analyze_and_anonymize(self, text) -> str:
        analyzer_results = self.analyzer.analyze(text=text, language='en')
        operators = {"DEFAULT": OperatorConfig("redact")}
        anonymizer_results = self.anonymizer.anonymize(
            text=text, analyzer_results=analyzer_results, operators=operators)

        return anonymizer_results.text
Example #21
def test_given_anonymize_called_with_error_scenarios_then_expected_errors_returned(
    anonymize_scenario, ):
    anonymizer_request_dict = json.loads(
        get_scenario_file_content("anonymize",
                                  f"{anonymize_scenario}.in.json"))
    expected_anonymize_result_json = json.loads(
        get_scenario_file_content("anonymize",
                                  f"{anonymize_scenario}.out.json"))
    engine = AnonymizerEngine()
    anonymizers_config = AnonymizerRequest.get_anonymizer_configs_from_json(
        anonymizer_request_dict)
    analyzer_results = AnonymizerRequest.handle_analyzer_results_json(
        anonymizer_request_dict)

    try:
        actual_anonymize_result = engine.anonymize(
            anonymizer_request_dict.get("text"), analyzer_results,
            anonymizers_config)
    except Exception as e:
        actual_anonymize_result = str(e)

    assert actual_anonymize_result == expected_anonymize_result_json
Example #22
def test_given_anonymize_called_with_error_scenarios_then_expected_errors_returned(
):
    text = "hello world, my name is Jane Doe. My number is: 03-4453334"
    anonymizers = {
        "PHONE_NUMBER":
        OperatorConfig("mask", {
            "masking_char": "non_character",
            "chars_to_mask": 6,
            "from_end": True
        })
    }
    analyzer_results = [RecognizerResult("PHONE_NUMBER", 48, 57, 0.95)]

    engine = AnonymizerEngine()

    try:
        actual_anonymize_result = engine.anonymize(text, analyzer_results,
                                                   anonymizers)
    except Exception as e:
        actual_anonymize_result = str(e)

    assert actual_anonymize_result == "Invalid input, masking_char must be a character"
Example #23
def test_given_several_anonymizers_then_we_use_the_correct_one():
    analyzer_result = AnalyzerResult.from_json({
        "score": 0.5,
        "entity_type": "PHONE_NUMBER",
        "start": 8,
        "end": 18
    })
    anonymizer_config = AnonymizerConfig("replace", {})
    anonymizer_config.anonymizer_class = MockAnonymizer
    text = AnonymizerEngine().anonymize("Number: 0554555556",
                                        [analyzer_result],
                                        {"PHONE_NUMBER": anonymizer_config})
    assert text == "Number: I am your new text!"
Example #24
class PDM:
    def __init__(self, language='en'):
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()
        self.language = language

    def predict(self, text, entities_of_interest=ENTITIES_OF_INTEREST):
        t0 = time()
        analyzer_results = self.analyzer.analyze(text, entities=entities_of_interest, language=self.language)
        t1 = time()
        anonymized_results = self.anonymizer.anonymize(text=text, analyzer_results=analyzer_results)
        t2 = time()
        results = {'time_to_analyze': f'{t1-t0:.4f} seconds',
                   'time_to_anonymize': f'{t2-t1:.4f} seconds',
                   'anonymized_text': anonymized_results.text,
                   'detected_items': [{'start': item.start, 'end': item.end, 'entity_type': item.entity_type} for item in anonymized_results.items]}
        return results
Example #25
def test_given_anonymize_with_encrypt_then_text_returned_with_encrypted_content(
):
    unencrypted_text = "My name is "
    expected_encrypted_text = "Chloë"
    text = unencrypted_text + expected_encrypted_text
    start_index = 11
    end_index = 16
    key = "WmZq4t7w!z%C&F)J"
    analyzer_results = [
        RecognizerResult("PERSON", start_index, end_index, 0.8)
    ]
    anonymizers_config = {"PERSON": AnonymizerConfig("encrypt", {"key": key})}

    actual_anonymize_result = (AnonymizerEngine().anonymize(
        text, analyzer_results, anonymizers_config).text)

    assert actual_anonymize_result[:start_index] == unencrypted_text
    actual_encrypted_text = actual_anonymize_result[start_index:]
    assert actual_encrypted_text != expected_encrypted_text
    actual_decrypted_text = AESCipher.decrypt(key.encode(),
                                              actual_encrypted_text)
    assert actual_decrypted_text == expected_encrypted_text
Example #26
    emails_dict = {}

if os.path.isfile(names_dict_pkl_path):
    names_dict = pickle.load( open(names_dict_pkl_path, "rb") )
else:
    names_dict = {}

# Define locale and language dictionaries
faker_locales_dict = {'UNITED STATES': 'en_US', 'ITALY': 'it_IT', 'GERMANY': 'de_DE'}



# Initialize Presidio's analyzer and anonymizer
# https://microsoft.github.io/presidio/supported_entities/
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Create a copy of the source dataset
df = dataset.copy()

# Apply the function anonymizeName for each value of the Name column
df.Name = pd.Series( [anonymizeName(text, country) for (text, country) in zip(df['Name'], df['Country'])] )

# Apply the function anonymizeEmail for each value of the Email column
df.Email = pd.Series( [anonymizeEmail(text, country) for (text, country) in zip(df['Email'], df['Country'])] )

# Column Notes is 'object' data type as it contains lot of NaN and
# Pandas doesn't recognize it as string. So it has to be cast to string
# in order to be anonymized. Then replace it with its anonymization
df.Notes = pd.Series( [anonymizeName(text, country) for (text, country) in zip(df['Notes'].astype('str'), df['Country'])] )
df.Notes = pd.Series( [anonymizeEmail(text, country) for (text, country) in zip(df['Notes'].astype('str'), df['Country'])] )
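The snippet above depends on helper functions anonymizeName and anonymizeEmail that are not shown (they also appear to cache generated values in names_dict and emails_dict). A possible sketch of anonymizeName, assuming it combines the analyzer with Faker and the faker_locales_dict defined above; the real helpers may differ.

from faker import Faker
from presidio_anonymizer.entities import OperatorConfig

def anonymizeName(text, country):
    # Pick a Faker locale for the row's country, falling back to en_US.
    fake = Faker(faker_locales_dict.get(country, "en_US"))
    results = analyzer.analyze(text=text, entities=["PERSON"], language="en")
    anonymized = anonymizer.anonymize(
        text=text,
        analyzer_results=results,
        # Assumes a Presidio version whose anonymize() accepts an `operators` mapping.
        operators={"PERSON": OperatorConfig("replace", {"new_value": fake.name()})},
    )
    return anonymized.text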
Example #27
class PresidioPIIAnalyzer(BaseAnalyzer):
    _analyzer: AnalyzerEngine = PrivateAttr()
    _anonymizer: AnonymizerEngine = PrivateAttr()
    TYPE: str = "PresidioPII"
    engine_config: Optional[PresidioEngineConfig] = None
    # To see list of supported entities refer https://microsoft.github.io/presidio/supported_entities/
    # To add customer recognizers refer https://microsoft.github.io/presidio/analyzer/adding_recognizers/
    entity_recognizers: Optional[List[EntityRecognizer]] = None
    # To find more details refer https://microsoft.github.io/presidio/anonymizer/
    anonymizers_config: Optional[Dict[str, OperatorConfig]] = None

    def __init__(self, **data: Any):
        super().__init__(**data)

        if not self.engine_config:
            self.engine_config = PresidioEngineConfig()

        # If spacy engine then load Spacy models and select languages
        languages = []
        for model_config in self.engine_config.models:
            languages.append(model_config.lang_code)

            # Check SpacyNlpEngine.engine_name
            if self.engine_config.nlp_engine_name == "spacy":
                try:
                    spacy_model = __import__(model_config.model_name)
                    spacy_model.load()
                    logger.info(
                        f"Spacy model {model_config.model_name} is already downloaded"
                    )
                except Exception:
                    logger.warning(
                        f"Spacy model {model_config.model_name} is not downloaded"
                    )
                    logger.warning(
                        f"Downloading spacy model {model_config.model_name}, it might take some time"
                    )
                    from spacy.cli import download

                    download(model_config.model_name)

        # Create NLP engine based on configuration
        provider = NlpEngineProvider(
            nlp_configuration=self.engine_config.dict())
        nlp_engine = provider.create_engine()

        # Pass the created NLP engine and supported_languages to the AnalyzerEngine
        self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                                        supported_languages=languages)

        # self._analyzer.registry.load_predefined_recognizers()
        if self.entity_recognizers:
            for entity_recognizer in self.entity_recognizers:
                self._analyzer.registry.add_recognizer(entity_recognizer)

        # Initialize the anonymizer with logger
        self._anonymizer = AnonymizerEngine()

    def analyze_input(
        self,
        source_response_list: List[AnalyzerRequest],
        analyzer_config: PresidioPIIAnalyzerConfig,
        language: Optional[str] = "en",
        **kwargs,
    ) -> List[AnalyzerResponse]:
        analyzer_output: List[AnalyzerResponse] = []

        for source_response in source_response_list:
            analyzer_result = self._analyzer.analyze(
                text=source_response.processed_text,
                entities=analyzer_config.entities,
                return_decision_process=analyzer_config.
                return_decision_process,
                language=language,
            )

            anonymized_result = None
            if not analyzer_config.analyze_only:
                anonymizers_config = (analyzer_config.anonymizers_config
                                      or self.anonymizers_config)

                if (source_response.processed_text is not None
                        and len(source_response.processed_text) > 0):
                    anonymized_result = self._anonymizer.anonymize(
                        text=source_response.processed_text,
                        operators=anonymizers_config,
                        analyzer_results=analyzer_result,
                    )

            if analyzer_config.replace_original_text and anonymized_result is not None:
                text = anonymized_result.text
            else:
                text = source_response.processed_text

            analyzer_output.append(
                AnalyzerResponse(
                    processed_text=text,
                    meta=source_response.meta,
                    segmented_data={
                        "analyzer_result":
                        [vars(result) for result in analyzer_result],
                        "anonymized_result":
                        None if not anonymized_result else
                        [vars(item) for item in anonymized_result.items],
                        "anonymized_text":
                        None
                        if not anonymized_result else anonymized_result.text,
                    },
                    source_name=source_response.source_name,
                ))

        return analyzer_output
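A minimal sketch of driving PresidioPIIAnalyzer above. The constructors of AnalyzerRequest and PresidioPIIAnalyzerConfig are assumptions inferred from the fields analyze_input actually reads (processed_text, meta, source_name, entities, analyze_only, return_decision_process, replace_original_text); the real classes may require more arguments.

# Hypothetical wiring; field names mirror what analyze_input() accesses.
pii_analyzer = PresidioPIIAnalyzer()

analyzer_config = PresidioPIIAnalyzerConfig(
    entities=["PERSON", "PHONE_NUMBER"],
    analyze_only=False,
    return_decision_process=False,
    replace_original_text=True,
)

request = AnalyzerRequest(
    processed_text="Call Jane Doe at 034453334",
    source_name="sample",
)

responses = pii_analyzer.analyze_input(
    source_response_list=[request],
    analyzer_config=analyzer_config,
)
for item in responses:
    print(item.processed_text, item.segmented_data["anonymized_text"])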
Example #28
def test_given_none_as_anonymziers_list_then_we_fall_to_default():
    engine = AnonymizerEngine()
    text = "please REPLACE ME."
    analyzer_result = RecognizerResult("SSN", 7, 17, 0.8)
    result = engine.anonymize(text, [analyzer_result]).text
    assert result == "please <SSN>."
Example #29
def test_given_empty_analyzers_list_then_we_get_same_text_back():
    engine = AnonymizerEngine()
    text = "one two three"
    assert engine.anonymize(text, [], {}).text == text
Example #30
def test_given_request_anonymizers_return_list():
    engine = AnonymizerEngine()
    expected_list = ["hash", "mask", "redact", "replace", "encrypt"]
    anon_list = engine.get_anonymizers()

    assert anon_list == expected_list