Esempio n. 1
0
def test_analyzer_with_generated_text(test_input, acceptance_threshold):
    """
        Test the Presidio analyzer with a generated dataset text file.

        :param test_input: input text file location (a format template that
         receives this file's directory path)
        :param acceptance_threshold: minimum precision/recall
         allowed for tests to pass
    """
    # read test input from generated file

    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(test_input.format(dir_path))

    # Align dataset entity names to Presidio's entity naming scheme
    updated_samples = Evaluator.align_entity_types(
        input_samples=input_samples, entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map
    )

    analyzer = PresidioAnalyzerWrapper()
    evaluator = Evaluator(model=analyzer)
    evaluated_samples = evaluator.evaluate_all(updated_samples)
    scores = evaluator.calculate_score(evaluation_results=evaluated_samples)

    # Both PII precision and recall must meet the acceptance threshold
    assert acceptance_threshold <= scores.pii_precision
    assert acceptance_threshold <= scores.pii_recall
Esempio n. 2
0
def test_to_spacy_all_entities():
    """Converting a synth dataset to spaCy format keeps one entry per sample."""
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))

    spacy_dataset = InputSample.create_spacy_dataset(samples)

    assert len(spacy_dataset) == len(samples)
Esempio n. 3
0
    def create_flair_corpus(self, train_samples_path, test_samples_path, val_samples_path):
        """Create flair_train/test/val.txt corpus files from synth datasets.

        Each output file is only generated if it does not already exist.
        Train samples with no tagged spans are dropped before conversion;
        test and val samples are converted as-is (matching original behavior).

        :param train_samples_path: path to the training synth dataset
        :param test_samples_path: path to the test synth dataset
        :param val_samples_path: path to the validation synth dataset
        """
        # One entry per split: (output file, input path, drop untagged samples?)
        splits = (
            ("flair_train.txt", train_samples_path, True),
            ("flair_test.txt", test_samples_path, False),
            ("flair_val.txt", val_samples_path, False),
        )
        for outfile, samples_path, drop_untagged in splits:
            if path.exists(outfile):
                continue  # already generated; skip
            samples = read_synth_dataset(samples_path)
            if drop_untagged:
                # Keep only samples that actually contain tagged spans
                samples = [sample for sample in samples if len(sample.spans) > 0]
                print("Kept {} train samples after removal of non-tagged samples".format(len(samples)))
            conll_data = InputSample.create_conll_dataset(samples)
            self.to_flair(conll_data, outfile=outfile)
Esempio n. 4
0
def test_to_conll():
    """CoNLL conversion yields exactly one distinct sentence per input sample."""
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))

    conll_df = InputSample.create_conll_dataset(samples)

    unique_sentences = conll_df['sentence'].unique()
    assert len(unique_sentences) == len(samples)
Esempio n. 5
0
def test_to_spach_json():
    """spaCy-JSON conversion keeps the sample count and exposes id/paragraphs keys."""
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))

    spacy_json = InputSample.create_spacy_json(samples)

    assert len(spacy_json) == len(samples)
    first_record = spacy_json[0]
    assert 'id' in first_record
    assert 'paragraphs' in first_record
Esempio n. 6
0
def test_to_spacy_all_entities_specific_entities():
    """Keeping only PERSON must leave some samples labeled, but fewer than all."""
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))

    spacy_dataset = InputSample.create_spacy_dataset(samples, entities=['PERSON'])

    # Samples whose annotation dict still carries at least one entity
    labeled = [item for item in spacy_dataset if item[1]['entities']]

    assert 0 < len(labeled) < len(samples)
Esempio n. 7
0
def test_dataset_to_metric_identity_model():
    """An identity mock model must score perfect PII precision and recall."""
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset("{}/data/generated_small.txt".format(here), length=10)

    mock_model = IdentityTokensMockModel()
    results = mock_model.evaluate_all(samples)
    metrics = mock_model.calculate_score(results)

    assert metrics.pii_precision == 1
    assert metrics.pii_recall == 1
Esempio n. 8
0
def no_test_test_crf_simple():
    """Disabled CRF evaluation check (the ``no_`` prefix keeps pytest from collecting it).

    Requires a pre-trained CRF pickle under ../model-outputs/.
    """
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))

    pickle_path = os.path.abspath(os.path.join(here, "..", "model-outputs/crf.pickle"))

    evaluator = CRFEvaluator(model_pickle_path=pickle_path, entities_to_keep=['PERSON'])
    results = evaluator.evaluate_all(samples)
    scores = evaluator.calculate_score(results)

    # With a single entity kept, PII-level scores must match the PERSON-level scores.
    np.testing.assert_almost_equal(scores.pii_precision, scores.entity_precision_dict['PERSON'])
    np.testing.assert_almost_equal(scores.pii_recall, scores.entity_recall_dict['PERSON'])
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0
Esempio n. 9
0
def no_unit_test_flair_simple():
    """Disabled Flair evaluation check (the ``no_`` prefix keeps pytest from collecting it).

    Loads the 'ner-ontonotes-fast' SequenceTagger, so it is too heavy for the
    regular unit-test run.
    """
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))

    tagger = SequenceTagger.load('ner-ontonotes-fast')  # .load('ner')

    evaluator = FlairEvaluator(model=tagger, entities_to_keep=['PERSON'])
    results = evaluator.evaluate_all(samples)
    scores = evaluator.calculate_score(results)

    # With a single entity kept, PII-level scores must match the PERSON-level scores.
    np.testing.assert_almost_equal(scores.pii_precision, scores.entity_precision_dict['PERSON'])
    np.testing.assert_almost_equal(scores.pii_recall, scores.entity_recall_dict['PERSON'])
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0
def test_spacy_recognizer_with_generated_text(test_input, acceptance_threshold):
    """
    Test the spaCy recognizer with a generated dataset text file.

    :param test_input: input text file location (a format template that
     receives this file's directory path)
    :param acceptance_threshold: minimum F-score allowed for the test to pass
    """

    # read test input from generated file
    import os
    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(
        test_input.format(dir_path))
    # Defect fixed: the fourth positional argument `True` bound to
    # `labeling_scheme` (see score_presidio_recognizer's signature), not to
    # `with_nlp_artifacts` as apparently intended. Pass it by keyword.
    scores = score_presidio_recognizer(
        SpacyRecognizer(), ['PERSON'], input_samples, with_nlp_artifacts=True)
    assert acceptance_threshold <= scores.pii_f
def test_credit_card_recognizer_with_generated_text(test_input,
                                                    acceptance_threshold):
    """
    Test the credit card recognizer with a generated dataset text file.

    :param test_input: input text file location (a format template that
     receives this file's directory path)
    :param acceptance_threshold: minimum F-score allowed for the test to pass
    """

    # read test input from generated file
    import os
    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(test_input.format(dir_path))
    # Defect fixed: entities_to_keep is typed List[str]; the bare string
    # 'CREDIT_CARD' is now wrapped in a list, matching the ['PERSON'] usage
    # in the sibling spaCy test.
    scores = score_presidio_recognizer(CreditCardRecognizer(), ['CREDIT_CARD'],
                                       input_samples)
    assert acceptance_threshold <= scores.pii_f
def test_spacy_simple():
    """Evaluate the en_core_web_lg spaCy model on PERSON and require non-zero scores."""
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(
        os.path.join(here, "data/generated_small.txt"))

    evaluator = SpacyEvaluator(model_name="en_core_web_lg",
                               entities_to_keep=['PERSON'])
    results = evaluator.evaluate_all(samples)
    scores = evaluator.calculate_score(results)

    # With a single entity kept, PII-level scores must match the PERSON-level scores.
    np.testing.assert_almost_equal(scores.pii_precision,
                                   scores.entity_precision_dict['PERSON'])
    np.testing.assert_almost_equal(scores.pii_recall,
                                   scores.entity_recall_dict['PERSON'])
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0
Esempio n. 13
0
def test_dataset_to_metric_50_50_model():
    """A mock model that blanks half its predictions keeps precision at 1
    while recall lands strictly between 0.25 and 0.75."""
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(
        "{}/data/generated_small.txt".format(here), length=100)

    # Replace 50% of the predictions with a list of "O"
    # NOTE(review): entities_to_keep is a bare string here, unlike the list
    # form used elsewhere in this file — confirm the mock accepts a string.
    mock_model = FiftyFiftyIdentityTokensMockModel(entities_to_keep='PERSON')

    results = mock_model.evaluate_all(samples)
    metrics = mock_model.calculate_score(results)

    print(metrics.pii_precision)
    print(metrics.pii_recall)
    print(metrics.pii_f)

    assert metrics.pii_precision == 1
    assert metrics.pii_recall < 0.75
    assert metrics.pii_recall > 0.25
Esempio n. 14
0
def test_generator_correct_output():
    """Generated samples must have exactly one tag per token.

    Runs the fake-data generator end to end on the bundled CSV/templates and
    validates the tag/token alignment of every generated sample.
    """
    OUTPUT = "generated_test.txt"
    EXAMPLES = 3

    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    fake_pii_csv = "{}/data/FakeNameGenerator.com_100.csv".format(dir_path)
    utterances_file = "{}/data/templates.txt".format(dir_path)
    dictionary = "{}/data/Dictionary_test.csv".format(dir_path)

    try:
        generate(fake_pii_csv=fake_pii_csv,
                 utterances_file=utterances_file,
                 dictionary_path=dictionary,
                 output_file=OUTPUT,
                 lower_case_ratio=0.3,
                 num_of_examples=EXAMPLES)

        input_samples = read_synth_dataset(OUTPUT)

        for sample in input_samples:
            assert len(sample.tags) == len(sample.tokens)
    finally:
        # Defect fixed: the generated file was previously left behind in the
        # working directory after the test ran.
        if os.path.exists(OUTPUT):
            os.remove(OUTPUT)
Esempio n. 15
0
def score_presidio_analyzer(
    input_samples: Optional[List[InputSample]] = None,
    entities_to_keep: Optional[List[str]] = None,
    labeling_scheme: str = "BILUO",
    verbose: bool = True,
) -> EvaluationResult:
    """Run the Presidio analyzer over a dataset and return its evaluation scores.

    :param input_samples: samples to evaluate; when omitted, the default synth
        dataset is read from ../../data/synth_dataset.txt
    :param entities_to_keep: entities the analyzer should detect; None keeps all
    :param labeling_scheme: token labeling scheme passed to the analyzer wrapper
    :param verbose: print per-entity span counts and pass verbosity downstream
    :return: the EvaluationResult produced by score_model
    """
    from collections import Counter

    if not input_samples:
        print("Reading dataset")
        input_samples = read_synth_dataset("../../data/synth_dataset.txt")
    else:
        input_samples = list(input_samples)

    print(
        "Preparing dataset by aligning entity names to Presidio's entity names"
    )

    updated_samples = Evaluator.align_entity_types(input_samples)

    # Count spans per entity type across all samples. (Replaces the former
    # flatten-lambda + list build with a single generator expression.)
    count_per_entity = Counter(
        span.entity_type
        for input_sample in updated_samples
        for span in input_sample.spans
    )
    if verbose:
        print("Count per entity:")
        print(count_per_entity)
    analyzer = PresidioAnalyzerWrapper(entities_to_keep=entities_to_keep,
                                       labeling_scheme=labeling_scheme)

    return score_model(
        model=analyzer,
        entities_to_keep=list(count_per_entity.keys()),
        input_samples=updated_samples,
        verbose=verbose,
    )
Esempio n. 16
0
def score_presidio_recognizer(
    recognizer: EntityRecognizer,
    entities_to_keep: List[str],
    input_samples: Optional[List[InputSample]] = None,
    labeling_scheme: str = "BILUO",
    with_nlp_artifacts: bool = False,
    verbose: bool = False,
) -> EvaluationResult:
    """
    Run data through one EntityRecognizer and gather results and stats.

    When no samples are supplied, the default synth dataset is read from
    ../../data/synth_dataset.txt. Entity names in the samples are aligned to
    Presidio's naming before the recognizer is evaluated.
    """

    if input_samples:
        input_samples = list(input_samples)
    else:
        print("Reading dataset")
        input_samples = read_synth_dataset("../../data/synth_dataset.txt")

    print(
        "Preparing dataset by aligning entity names to Presidio's entity names"
    )

    aligned_samples = Evaluator.align_entity_types(input_samples)

    # Wrap the single recognizer so it can be scored like any other model.
    wrapped_model = PresidioRecognizerWrapper(
        recognizer=recognizer,
        entities_to_keep=entities_to_keep,
        labeling_scheme=labeling_scheme,
        nlp_engine=SpacyNlpEngine(),
        with_nlp_artifacts=with_nlp_artifacts,
    )
    return score_model(
        model=wrapped_model,
        entities_to_keep=entities_to_keep,
        input_samples=aligned_samples,
        verbose=verbose,
    )
Esempio n. 17
0
        #
        response_tags = span_to_tag(
            scheme=self.labeling_scheme,
            text=sample.full_text,
            start=starts,
            end=ends,
            tokens=sample.tokens,
            scores=scores,
            tag=tags,
        )
        return response_tags


if __name__ == "__main__":
    print("Reading dataset")
    input_samples = read_synth_dataset("../data/synth_dataset.txt")

    print(
        "Preparing dataset by aligning entity names to Presidio's entity names"
    )

    # Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity
    entities_mapping = {
        "PERSON": "PERSON",
        "EMAIL": "EMAIL_ADDRESS",
        "CREDIT_CARD": "CREDIT_CARD",
        "FIRST_NAME": "PERSON",
        "PHONE_NUMBER": "PHONE_NUMBER",
        "BIRTHDAY": "DATE_TIME",
        "DATE": "DATE_TIME",
        "DOMAIN": "DOMAIN",
                tags.append(res.entity_type)
                scores.append(res.score)
        #
        response_tags = span_to_tag(scheme=self.labeling_scheme,
                                    text=sample.full_text,
                                    start=starts,
                                    end=ends,
                                    tokens=sample.tokens,
                                    scores=scores,
                                    tag=tags)
        return response_tags


if __name__ == "__main__":
    print("Reading dataset")
    input_samples = read_synth_dataset("../data/synth_dataset.json")

    print("Preparing dataset by aligning entity names to Presidio's entity names")

    # Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity
    entities_mapping = {
        'PERSON': 'PERSON',
        'EMAIL': 'EMAIL_ADDRESS',
        'CREDIT_CARD': 'CREDIT_CARD',
        'FIRST_NAME': 'PERSON',
        'PHONE_NUMBER': 'PHONE_NUMBER',
        'BIRTHDAY': 'DATE_TIME',
        'DATE': 'DATE_TIME',
        'DOMAIN': 'DOMAIN',
        'CITY': 'LOCATION',
        'ADDRESS': 'LOCATION',