def test_analyzer_with_generated_text(test_input, acceptance_threshold):
    """
    Test analyzer with a generated dataset text file
    :param test_input: input text file location
    :param acceptance_threshold: minimum precision/recall allowed for tests to pass
    """
    # Read test input from the generated file
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(test_input.format(dir_path))

    updated_samples = Evaluator.align_entity_types(
        input_samples=input_samples,
        entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map,
    )

    analyzer = PresidioAnalyzerWrapper()
    evaluator = Evaluator(model=analyzer)
    evaluated_samples = evaluator.evaluate_all(updated_samples)
    scores = evaluator.calculate_score(evaluation_results=evaluated_samples)

    assert acceptance_threshold <= scores.pii_precision
    assert acceptance_threshold <= scores.pii_recall
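# A minimal parametrization sketch for the test above, assuming pytest as the
# test runner. The dataset file name and the 0.3 threshold are illustrative
# assumptions, not values taken from the original suite; test_input is expected
# to contain a "{}" placeholder that is filled with the test directory path.
import pytest


@pytest.mark.parametrize(
    "test_input, acceptance_threshold",
    [("{}/data/generated_small.txt", 0.3)],
)
def test_analyzer_with_generated_text_small(test_input, acceptance_threshold):
    test_analyzer_with_generated_text(test_input, acceptance_threshold)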
def test_flair_simple(small_dataset):
    flair_model = FlairModel(model_path="ner", entities_to_keep=["PERSON"])
    evaluator = Evaluator(model=flair_model)
    evaluation_results = evaluator.evaluate_all(small_dataset)
    scores = evaluator.calculate_score(evaluation_results)

    assert_model_results_gt(scores, "PERSON", 0)
def score_model(
    model: BaseModel,
    entities_to_keep: List[str],
    input_samples: List[InputSample],
    verbose: bool = False,
    beta: float = 2.5,
) -> EvaluationResult:
    """
    Run data through a model and gather results and stats
    """

    print("Evaluating samples")
    evaluator = Evaluator(model=model, entities_to_keep=entities_to_keep)
    evaluated_samples = evaluator.evaluate_all(input_samples)

    print("Estimating metrics")
    evaluation_result = evaluator.calculate_score(
        evaluation_results=evaluated_samples, beta=beta
    )
    precision = evaluation_result.pii_precision
    recall = evaluation_result.pii_recall
    entity_recall = evaluation_result.entity_recall_dict
    entity_precision = evaluation_result.entity_precision_dict
    f = evaluation_result.pii_f
    errors = evaluation_result.model_errors

    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F {beta}: {f}")
    print(f"Precision per entity: {entity_precision}")
    print(f"Recall per entity: {entity_recall}")

    if verbose:
        false_negatives = [
            str(mistake) for mistake in errors if mistake.error_type == "FN"
        ]
        false_positives = [
            str(mistake) for mistake in errors if mistake.error_type == "FP"
        ]
        other_mistakes = [
            str(mistake) for mistake in errors if mistake.error_type not in ["FN", "FP"]
        ]

        print("False negatives: ")
        print("\n".join(false_negatives))
        print("\n******************\n")

        print("False positives: ")
        print("\n".join(false_positives))
        print("\n******************\n")

        print("Other mistakes: ")
        print("\n".join(other_mistakes))

    return evaluation_result
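# A minimal usage sketch for score_model, assuming a dataset file readable by
# read_synth_dataset and a Flair NER model as used in the tests above; the
# file path below is an illustrative assumption.
if __name__ == "__main__":
    samples = read_synth_dataset("data/generated_small.txt")
    flair_model = FlairModel(model_path="ner", entities_to_keep=["PERSON"])
    results = score_model(
        model=flair_model,
        entities_to_keep=["PERSON"],
        input_samples=samples,
        verbose=True,
    )
    print(results.pii_f)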
def test_crf_simple(small_dataset):
    train_test_ratios = [0.7, 0.3]
    train, test = split_dataset(small_dataset, train_test_ratios)

    crf_model = CRFModel(model_pickle_path=None, entities_to_keep=["PERSON"])
    crf_model.fit(train)

    evaluator = Evaluator(model=crf_model)
    evaluation_results = evaluator.evaluate_all(test)
    scores = evaluator.calculate_score(evaluation_results)

    assert_model_results_gt(scores, "PERSON", 0)
def test_dataset_to_metric_identity_model():
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = InputSample.read_dataset_json(
        "{}/data/generated_small.json".format(dir_path), length=10
    )

    # The identity mock model returns the gold tags as its predictions,
    # so precision and recall should both be perfect.
    model = IdentityTokensMockModel()
    evaluator = Evaluator(model=model)
    evaluation_results = evaluator.evaluate_all(input_samples)
    metrics = evaluator.calculate_score(evaluation_results)

    assert metrics.pii_precision == 1
    assert metrics.pii_recall == 1
def test_evaluate_multiple_examples_correct_statistics():
    prediction = ["U-PERSON", "O", "O", "U-PERSON", "O", "O"]
    model = MockTokensModel(prediction=prediction)
    evaluator = Evaluator(model=model, entities_to_keep=["PERSON"])

    input_sample = InputSample("My name is Raphael or David", masked=None, spans=None)
    input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"]
    input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"]

    evaluated = evaluator.evaluate_all(
        [input_sample, input_sample, input_sample, input_sample]
    )
    scores = evaluator.calculate_score(evaluated)

    # Per sample, the prediction yields one true positive ("Raphael"), one
    # false positive ("My") and one false negative ("David"), so both
    # precision and recall equal 1 / (1 + 1) = 0.5.
    assert scores.pii_precision == 0.5
    assert scores.pii_recall == 0.5
def test_spacy_simple():
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(
        os.path.join(dir_path, "data/generated_small.txt")
    )

    spacy_model = SpacyModel(model_name="en_core_web_lg", entities_to_keep=["PERSON"])
    evaluator = Evaluator(model=spacy_model)
    evaluation_results = evaluator.evaluate_all(input_samples)
    scores = evaluator.calculate_score(evaluation_results)

    np.testing.assert_almost_equal(
        scores.pii_precision, scores.entity_precision_dict["PERSON"]
    )
    np.testing.assert_almost_equal(
        scores.pii_recall, scores.entity_recall_dict["PERSON"]
    )
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0
def test_dataset_to_metric_50_50_model():
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = InputSample.read_dataset_json(
        "{}/data/generated_small.json".format(dir_path), length=100
    )

    # Replace 50% of the predictions with a list of "O"
    model = FiftyFiftyIdentityTokensMockModel()
    evaluator = Evaluator(model=model, entities_to_keep=["PERSON"])
    evaluation_results = evaluator.evaluate_all(input_samples)
    metrics = evaluator.calculate_score(evaluation_results)

    print(metrics.pii_precision)
    print(metrics.pii_recall)
    print(metrics.pii_f)

    assert metrics.pii_precision == 1
    assert 0.25 < metrics.pii_recall < 0.75
def test_flair_simple():
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(
        os.path.join(dir_path, "data/generated_small.txt")
    )

    model = SequenceTagger.load("ner-ontonotes-fast")  # alternatively: .load("ner")
    flair_model = FlairModel(model=model, entities_to_keep=["PERSON"])
    evaluator = Evaluator(model=flair_model)
    evaluation_results = evaluator.evaluate_all(input_samples)
    scores = evaluator.calculate_score(evaluation_results)

    np.testing.assert_almost_equal(
        scores.pii_precision, scores.entity_precision_dict["PERSON"]
    )
    np.testing.assert_almost_equal(
        scores.pii_recall, scores.entity_recall_dict["PERSON"]
    )
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0
def test_test_crf_simple():
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(
        os.path.join(dir_path, "data/generated_small.txt")
    )

    model_path = os.path.abspath(
        os.path.join(dir_path, "..", "model-outputs/crf.pickle")
    )

    crf_model = CRFModel(model_pickle_path=model_path, entities_to_keep=["PERSON"])
    evaluator = Evaluator(model=crf_model)
    evaluation_results = evaluator.evaluate_all(input_samples)
    scores = evaluator.calculate_score(evaluation_results)

    np.testing.assert_almost_equal(
        scores.pii_precision, scores.entity_precision_dict["PERSON"]
    )
    np.testing.assert_almost_equal(
        scores.pii_recall, scores.entity_recall_dict["PERSON"]
    )
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0