def test_analyzer_with_generated_text(test_input, acceptance_threshold):
    """
    Test analyzer with a generated dataset text file

    :param test_input: input text file location
    :param acceptance_threshold: minimum precision/recall allowed for tests to pass
    """
    # Resolve the generated file relative to this test module
    import os
    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(test_input.format(here))

    # Align dataset entity names to Presidio's entity names before scoring
    aligned = Evaluator.align_entity_types(
        input_samples=samples,
        entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map,
    )

    evaluator = Evaluator(model=PresidioAnalyzerWrapper())
    results = evaluator.evaluate_all(aligned)
    scores = evaluator.calculate_score(evaluation_results=results)

    assert scores.pii_precision >= acceptance_threshold
    assert scores.pii_recall >= acceptance_threshold
def test_to_spacy_all_entities():
    """Spacy conversion must yield exactly one record per input sample."""
    import os
    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))
    spacy_dataset = InputSample.create_spacy_dataset(samples)
    assert len(spacy_dataset) == len(samples)
def create_flair_corpus(self, train_samples_path, test_samples_path, val_samples_path):
    """
    Create the three Flair corpus files (train/test/val) from synthetic datasets.

    Each file is only generated if it does not already exist on disk.
    Training samples with no tagged spans are dropped; test and validation
    samples are kept as-is.

    :param train_samples_path: path to the training dataset file
    :param test_samples_path: path to the test dataset file
    :param val_samples_path: path to the validation dataset file
    """
    if not path.exists("flair_train.txt"):
        train_samples = read_synth_dataset(train_samples_path)
        # Only the training split filters out untagged samples
        train_tagged = [sample for sample in train_samples if len(sample.spans) > 0]
        print("Kept {} train samples after removal of non-tagged samples".format(len(train_tagged)))
        self._samples_to_flair_file(train_tagged, "flair_train.txt")

    if not path.exists("flair_test.txt"):
        self._samples_to_flair_file(read_synth_dataset(test_samples_path), "flair_test.txt")

    if not path.exists("flair_val.txt"):
        self._samples_to_flair_file(read_synth_dataset(val_samples_path), "flair_val.txt")

def _samples_to_flair_file(self, samples, outfile):
    """Convert samples to a CoNLL dataset and write it as a Flair file."""
    data = InputSample.create_conll_dataset(samples)
    self.to_flair(data, outfile=outfile)
def test_to_conll():
    """CoNLL conversion keeps a one-to-one mapping between samples and sentences."""
    import os
    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))
    conll_df = InputSample.create_conll_dataset(samples)
    unique_sentences = conll_df['sentence'].unique()
    assert len(unique_sentences) == len(samples)
def test_to_spach_json():
    # NOTE(review): "spach" looks like a typo for "spacy"; name kept to
    # preserve the collected test id.
    import os
    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))
    as_json = InputSample.create_spacy_json(samples)
    # One JSON record per sample, each carrying the spacy training schema keys
    assert len(as_json) == len(samples)
    for key in ('id', 'paragraphs'):
        assert key in as_json[0]
def test_to_spacy_all_entities_specific_entities():
    """Restricting the spacy dataset to PERSON drops some, but not all, labels."""
    import os
    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))
    spacy_dataset = InputSample.create_spacy_dataset(samples, entities=['PERSON'])
    labeled = [record for record in spacy_dataset if len(record[1]['entities'])]
    assert 0 < len(labeled) < len(samples)
def test_dataset_to_metric_identity_model():
    """A model that echoes the gold tags must score perfect precision and recall."""
    import os
    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(f"{here}/data/generated_small.txt", length=10)

    model = IdentityTokensMockModel()
    results = model.evaluate_all(samples)
    metrics = model.calculate_score(results)

    assert metrics.pii_precision == 1
    assert metrics.pii_recall == 1
def no_test_test_crf_simple():
    # Not collected by pytest's default "test_" discovery; presumably disabled
    # because it needs a trained CRF pickle on disk — confirm before enabling.
    import os
    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))
    pickle_path = os.path.abspath(os.path.join(here, "..", "model-outputs/crf.pickle"))

    evaluator = CRFEvaluator(model_pickle_path=pickle_path, entities_to_keep=['PERSON'])
    results = evaluator.evaluate_all(samples)
    scores = evaluator.calculate_score(results)

    # With a single entity kept, PII-level scores must equal the entity-level ones
    np.testing.assert_almost_equal(scores.pii_precision, scores.entity_precision_dict['PERSON'])
    np.testing.assert_almost_equal(scores.pii_recall, scores.entity_recall_dict['PERSON'])
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0
def no_unit_test_flair_simple():
    # Not collected by pytest's default "test_" discovery; presumably disabled
    # because loading the Flair tagger is heavy — confirm before enabling.
    import os
    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))

    tagger = SequenceTagger.load('ner-ontonotes-fast')  # .load('ner')
    evaluator = FlairEvaluator(model=tagger, entities_to_keep=['PERSON'])
    results = evaluator.evaluate_all(samples)
    scores = evaluator.calculate_score(results)

    # With a single entity kept, PII-level scores must equal the entity-level ones
    np.testing.assert_almost_equal(scores.pii_precision, scores.entity_precision_dict['PERSON'])
    np.testing.assert_almost_equal(scores.pii_recall, scores.entity_recall_dict['PERSON'])
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0
def test_spacy_recognizer_with_generated_text(test_input, acceptance_threshold):
    """
    Test spacy recognizer with a generated dataset text file

    :param test_input: input text file location
    :param acceptance_threshold: minimum precision/recall allowed for tests to pass
    """
    # read test input from generated file
    import os
    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(test_input.format(dir_path))
    # BUG FIX: previously `True` was passed as the 4th positional argument,
    # which binds to `labeling_scheme` in score_presidio_recognizer's
    # signature, not to `with_nlp_artifacts` as intended. Keyword args make
    # the intent explicit and restore the default labeling scheme.
    scores = score_presidio_recognizer(
        recognizer=SpacyRecognizer(),
        entities_to_keep=['PERSON'],
        input_samples=input_samples,
        with_nlp_artifacts=True,
    )
    assert acceptance_threshold <= scores.pii_f
def test_credit_card_recognizer_with_generated_text(test_input, acceptance_threshold):
    """
    Test credit card recognizer with a generated dataset text file

    :param test_input: input text file location
    :param acceptance_threshold: minimum precision/recall allowed for tests to pass
    """
    # Resolve the generated file relative to this test module
    import os
    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(test_input.format(here))
    # NOTE(review): entities_to_keep is passed as the bare string 'CREDIT_CARD'
    # rather than a list — verify the scorer accepts a plain string.
    scores = score_presidio_recognizer(CreditCardRecognizer(), 'CREDIT_CARD', samples)
    assert scores.pii_f >= acceptance_threshold
def test_spacy_simple():
    """Evaluate the spacy model on PERSON and sanity-check the aggregate scores."""
    import os
    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))

    evaluator = SpacyEvaluator(model_name="en_core_web_lg", entities_to_keep=['PERSON'])
    results = evaluator.evaluate_all(samples)
    scores = evaluator.calculate_score(results)

    # With a single entity kept, PII-level scores must equal the entity-level ones
    np.testing.assert_almost_equal(scores.pii_precision, scores.entity_precision_dict['PERSON'])
    np.testing.assert_almost_equal(scores.pii_recall, scores.entity_recall_dict['PERSON'])
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0
def test_dataset_to_metric_50_50_model():
    """A mock model wrong on ~half its inputs keeps precision at 1 but halves recall."""
    import os
    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(f"{here}/data/generated_small.txt", length=100)

    # Replace 50% of the predictions with a list of "O"
    model = FiftyFiftyIdentityTokensMockModel(entities_to_keep='PERSON')
    results = model.evaluate_all(samples)
    metrics = model.calculate_score(results)

    for value in (metrics.pii_precision, metrics.pii_recall, metrics.pii_f):
        print(value)

    assert metrics.pii_precision == 1
    assert 0.25 < metrics.pii_recall < 0.75
def test_generator_correct_output():
    """Generated samples must have exactly one tag per token."""
    OUTPUT = "generated_test.txt"
    EXAMPLES = 3
    import os
    here = os.path.dirname(os.path.realpath(__file__))

    generate(
        fake_pii_csv=f"{here}/data/FakeNameGenerator.com_100.csv",
        utterances_file=f"{here}/data/templates.txt",
        dictionary_path=f"{here}/data/Dictionary_test.csv",
        output_file=OUTPUT,
        lower_case_ratio=0.3,
        num_of_examples=EXAMPLES,
    )

    for sample in read_synth_dataset(OUTPUT):
        assert len(sample.tags) == len(sample.tokens)
def score_presidio_analyzer(
    input_samples: Optional[List[InputSample]] = None,
    entities_to_keep: Optional[List[str]] = None,
    labeling_scheme: str = "BILUO",
    verbose: bool = True,
) -> EvaluationResult:
    """
    Run the full Presidio analyzer over a dataset and score it.

    :param input_samples: samples to evaluate; when falsy, a default synthetic
        dataset is read from ../../data/synth_dataset.txt
    :param entities_to_keep: entities to restrict the analyzer to (None = all)
    :param labeling_scheme: token labeling scheme passed to the analyzer wrapper
    :param verbose: if True, print the per-entity span counts
    :return: the EvaluationResult produced by score_model
    """
    if not input_samples:
        print("Reading dataset")
        input_samples = read_synth_dataset("../../data/synth_dataset.txt")
    else:
        input_samples = list(input_samples)

    print(
        "Preparing dataset by aligning entity names to Presidio's entity names"
    )
    updated_samples = Evaluator.align_entity_types(input_samples)

    from collections import Counter

    # Count spans per entity type; the keys also become the entity list
    # scored on below. (Replaces an assigned `flatten` lambda plus two
    # intermediate lists - PEP 8 E731 - with a single generator expression.)
    count_per_entity = Counter(
        span.entity_type
        for input_sample in updated_samples
        for span in input_sample.spans
    )
    if verbose:
        print("Count per entity:")
        print(count_per_entity)

    analyzer = PresidioAnalyzerWrapper(entities_to_keep=entities_to_keep,
                                       labeling_scheme=labeling_scheme)
    return score_model(
        model=analyzer,
        entities_to_keep=list(count_per_entity.keys()),
        input_samples=updated_samples,
        verbose=verbose,
    )
def score_presidio_recognizer(
    recognizer: EntityRecognizer,
    entities_to_keep: List[str],
    input_samples: Optional[List[InputSample]] = None,
    labeling_scheme: str = "BILUO",
    with_nlp_artifacts: bool = False,
    verbose: bool = False,
) -> EvaluationResult:
    """
    Run data through one EntityRecognizer and gather results and stats
    """
    if input_samples:
        # Defensive copy so callers' lists are never mutated downstream
        input_samples = list(input_samples)
    else:
        print("Reading dataset")
        input_samples = read_synth_dataset("../../data/synth_dataset.txt")

    print(
        "Preparing dataset by aligning entity names to Presidio's entity names"
    )
    updated_samples = Evaluator.align_entity_types(input_samples)

    wrapped_model = PresidioRecognizerWrapper(
        recognizer=recognizer,
        entities_to_keep=entities_to_keep,
        labeling_scheme=labeling_scheme,
        nlp_engine=SpacyNlpEngine(),
        with_nlp_artifacts=with_nlp_artifacts,
    )
    return score_model(
        model=wrapped_model,
        entities_to_keep=entities_to_keep,
        input_samples=updated_samples,
        verbose=verbose,
    )
# response_tags = span_to_tag( scheme=self.labeling_scheme, text=sample.full_text, start=starts, end=ends, tokens=sample.tokens, scores=scores, tag=tags, ) return response_tags if __name__ == "__main__": print("Reading dataset") input_samples = read_synth_dataset("../data/synth_dataset.txt") print( "Preparing dataset by aligning entity names to Presidio's entity names" ) # Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity entities_mapping = { "PERSON": "PERSON", "EMAIL": "EMAIL_ADDRESS", "CREDIT_CARD": "CREDIT_CARD", "FIRST_NAME": "PERSON", "PHONE_NUMBER": "PHONE_NUMBER", "BIRTHDAY": "DATE_TIME", "DATE": "DATE_TIME", "DOMAIN": "DOMAIN",
tags.append(res.entity_type) scores.append(res.score) # response_tags = span_to_tag(scheme=self.labeling_scheme, text=sample.full_text, start=starts, end=ends, tokens=sample.tokens, scores=scores, tag=tags) return response_tags if __name__ == "__main__": print("Reading dataset") input_samples = read_synth_dataset("../data/synth_dataset.json") print("Preparing dataset by aligning entity names to Presidio's entity names") # Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity entities_mapping = { 'PERSON': 'PERSON', 'EMAIL': 'EMAIL_ADDRESS', 'CREDIT_CARD': 'CREDIT_CARD', 'FIRST_NAME': 'PERSON', 'PHONE_NUMBER': 'PHONE_NUMBER', 'BIRTHDAY': 'DATE_TIME', 'DATE': 'DATE_TIME', 'DOMAIN': 'DOMAIN', 'CITY': 'LOCATION', 'ADDRESS': 'LOCATION',