def test_entity_evaluation_report(tmpdir_factory):
    class EntityExtractorA(EntityExtractor):

        provides = ["entities"]

        def __init__(self, component_config=None) -> None:
            super().__init__(component_config)

    class EntityExtractorB(EntityExtractor):

        provides = ["entities"]

        def __init__(self, component_config=None) -> None:
            super().__init__(component_config)

    path = tmpdir_factory.mktemp("evaluation").strpath
    report_folder = os.path.join(path, "reports")
    report_filename_a = os.path.join(report_folder, "EntityExtractorA_report.json")
    report_filename_b = os.path.join(report_folder, "EntityExtractorB_report.json")

    rasa.utils.io.create_directory(report_folder)

    mock_interpreter = Interpreter(
        [
            EntityExtractorA({"provides": ["entities"]}),
            EntityExtractorB({"provides": ["entities"]}),
        ],
        None,
    )
    extractors = get_entity_extractors(mock_interpreter)
    result = evaluate_entities(
        [EN_entity_result],
        extractors,
        report_folder,
        errors=True,
        successes=True,
        disable_plotting=False,
    )

    report_a = json.loads(rasa.utils.io.read_file(report_filename_a))
    report_b = json.loads(rasa.utils.io.read_file(report_filename_b))

    assert len(report_a) == 6
    assert report_a["datetime"]["support"] == 1.0
    assert report_b["macro avg"]["recall"] == 0.0
    assert report_a["macro avg"]["recall"] == 0.5
    assert result["EntityExtractorA"]["accuracy"] == 0.75

    assert os.path.exists(
        os.path.join(report_folder, "EntityExtractorA_confusion_matrix.png")
    )
    assert os.path.exists(os.path.join(report_folder, "EntityExtractorA_errors.json"))
    assert os.path.exists(
        os.path.join(report_folder, "EntityExtractorA_successes.json")
    )
    assert not os.path.exists(
        os.path.join(report_folder, "EntityExtractorA_histogram.png")
    )

def test_entity_evaluation_report(tmpdir_factory):
    path = tmpdir_factory.mktemp("evaluation").strpath
    report_folder = os.path.join(path, "reports")

    mock_extractors = ["A", "B"]
    report_filename_a = os.path.join(report_folder, "A_report.json")
    report_filename_b = os.path.join(report_folder, "B_report.json")
    ner_filename = os.path.join(report_folder, "ner_filename.json")

    utils.create_dir(report_folder)

    result = evaluate_entities(
        [EN_targets],
        [EN_predicted],
        [EN_tokens],
        mock_extractors,
        report_folder,
        ner_filename,
    )

    report_a = json.loads(rasa.utils.io.read_file(report_filename_a))
    report_b = json.loads(rasa.utils.io.read_file(report_filename_b))

    assert len(report_a) == 8
    assert report_a["datetime"]["support"] == 1.0
    assert report_b["macro avg"]["recall"] == 0.2
    assert result["A"]["accuracy"] == 0.75

def test_entity_evaluation_report(tmpdir_factory):
    class EntityExtractorA(EntityExtractor):

        provides = ["entities"]

        def __init__(self, component_config=None) -> None:
            super(EntityExtractorA, self).__init__(component_config)

    class EntityExtractorB(EntityExtractor):

        provides = ["entities"]

        def __init__(self, component_config=None) -> None:
            super(EntityExtractorB, self).__init__(component_config)

    path = tmpdir_factory.mktemp("evaluation").strpath
    report_folder = os.path.join(path, "reports")
    report_filename_a = os.path.join(report_folder, "EntityExtractorA_report.json")
    report_filename_b = os.path.join(report_folder, "EntityExtractorB_report.json")

    utils.create_dir(report_folder)

    mock_interpreter = Interpreter(
        [
            EntityExtractorA({"provides": ["entities"]}),
            EntityExtractorB({"provides": ["entities"]}),
        ],
        None,
    )
    extractors = get_entity_extractors(mock_interpreter)
    result = evaluate_entities([EN_entity_result], extractors, report_folder)

    report_a = json.loads(rasa.utils.io.read_file(report_filename_a))
    report_b = json.loads(rasa.utils.io.read_file(report_filename_b))

    assert len(report_a) == 8
    assert report_a["datetime"]["support"] == 1.0
    assert report_b["macro avg"]["recall"] == 0.2
    assert result["EntityExtractorA"]["accuracy"] == 0.75

def test_entity_evaluation_report(tmp_path: Path):
    path = tmp_path / "evaluation"
    path.mkdir()
    report_folder = str(path / "reports")
    report_filename_a = os.path.join(report_folder, "EntityExtractorA_report.json")
    report_filename_b = os.path.join(report_folder, "EntityExtractorB_report.json")

    rasa.shared.utils.io.create_directory(report_folder)

    extractors = _get_active_entity_extractors([EN_entity_result])
    result = evaluate_entities(
        [EN_entity_result],
        extractors,
        report_folder,
        errors=True,
        successes=True,
        disable_plotting=False,
    )

    report_a = json.loads(rasa.shared.utils.io.read_file(report_filename_a))
    report_b = json.loads(rasa.shared.utils.io.read_file(report_filename_b))

    assert len(report_a) == 6
    assert report_a["datetime"]["support"] == 1.0
    assert report_b["macro avg"]["recall"] == 0.0
    assert report_a["macro avg"]["recall"] == 0.5
    assert result["EntityExtractorA"]["accuracy"] == 0.75

    assert os.path.exists(
        os.path.join(report_folder, "EntityExtractorA_confusion_matrix.png")
    )
    assert os.path.exists(os.path.join(report_folder, "EntityExtractorA_errors.json"))
    assert os.path.exists(
        os.path.join(report_folder, "EntityExtractorA_successes.json")
    )
    assert not os.path.exists(
        os.path.join(report_folder, "EntityExtractorA_histogram.png")
    )

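# Note: the *_report.json files asserted on above have the shape produced by
# scikit-learn's classification_report(..., output_dict=True), which Rasa's
# evaluation helpers use under the hood. A minimal standalone sketch of that
# shape follows; the labels and values below are invented for illustration and
# are not taken from the fixtures used in the tests above.
from sklearn.metrics import classification_report

example_targets = ["datetime", "no_entity", "no_entity", "location"]
example_predictions = ["datetime", "no_entity", "location", "no_entity"]

example_report = classification_report(
    example_targets, example_predictions, output_dict=True
)
# example_report contains one entry per label plus aggregate entries, roughly:
# {
#     "datetime":     {"precision": 1.0, "recall": 1.0, "f1-score": 1.0, "support": 1},
#     "location":     {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 1},
#     "no_entity":    {"precision": 0.5, "recall": 0.5, "f1-score": 0.5, "support": 2},
#     "accuracy":     0.5,
#     "macro avg":    {"precision": 0.5, "recall": 0.5, "f1-score": 0.5, "support": 4},
#     "weighted avg": {"precision": 0.5, "recall": 0.5, "f1-score": 0.5, "support": 4},
# }
# which is why the tests can index report_a["datetime"]["support"] and
# report_a["macro avg"]["recall"] directly.
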
async def test(
    stories: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    out_directory: Optional[Text] = None,
    fail_on_prediction_errors: bool = False,
    e2e: bool = False,
    disable_plotting: bool = False,
    successes: bool = False,
    errors: bool = True,
    warnings: bool = True,
) -> Dict[Text, Any]:
    """Run the evaluation of the stories, optionally plot the results.

    Args:
        stories: the stories to evaluate on
        agent: the agent
        max_stories: maximum number of stories to consider
        out_directory: path to the directory to write results to
        fail_on_prediction_errors: boolean indicating whether to fail on
            prediction errors or not
        e2e: boolean indicating whether to use end to end evaluation or not
        disable_plotting: boolean indicating whether to disable plotting or not
        successes: boolean indicating whether to write down successful
            predictions or not
        errors: boolean indicating whether to write down incorrect predictions
            or not
        warnings: boolean indicating whether to write down prediction warnings
            or not

    Returns:
        Evaluation summary.
    """
    from rasa.model_testing import get_evaluation_metrics

    generator = _create_data_generator(stories, agent, max_stories, e2e)
    completed_trackers = generator.generate_story_trackers()

    story_evaluation, _, entity_results = await _collect_story_predictions(
        completed_trackers, agent, fail_on_prediction_errors, use_e2e=e2e
    )

    evaluation_store = story_evaluation.evaluation_store

    with pywarnings.catch_warnings():
        from sklearn.exceptions import UndefinedMetricWarning

        pywarnings.simplefilter("ignore", UndefinedMetricWarning)

        targets, predictions = evaluation_store.serialise()

        if out_directory:
            report, precision, f1, action_accuracy = get_evaluation_metrics(
                targets, predictions, output_dict=True
            )

            # Add conversation level accuracy to story report.
            num_failed = len(story_evaluation.failed_stories)
            num_correct = len(story_evaluation.successful_stories)
            num_warnings = len(story_evaluation.stories_with_warnings)
            num_convs = num_failed + num_correct
            if num_convs and isinstance(report, Dict):
                conv_accuracy = num_correct / num_convs
                report["conversation_accuracy"] = {
                    "accuracy": conv_accuracy,
                    "correct": num_correct,
                    "with_warnings": num_warnings,
                    "total": num_convs,
                }
            report_filename = os.path.join(out_directory, REPORT_STORIES_FILE)
            rasa.shared.utils.io.dump_obj_as_json_to_file(report_filename, report)
            logger.info(f"Stories report saved to {report_filename}.")
        else:
            report, precision, f1, action_accuracy = get_evaluation_metrics(
                targets, predictions, output_dict=True
            )

        evaluate_entities(
            entity_results,
            POLICIES_THAT_EXTRACT_ENTITIES,
            out_directory,
            successes,
            errors,
            disable_plotting,
        )

    telemetry.track_core_model_test(len(generator.story_graph.story_steps), e2e, agent)

    _log_evaluation_table(
        evaluation_store.action_targets,
        "ACTION",
        action_accuracy,
        precision=precision,
        f1=f1,
        in_training_data_fraction=story_evaluation.in_training_data_fraction,
    )

    if not disable_plotting and out_directory:
        _plot_story_evaluation(
            evaluation_store.action_targets,
            evaluation_store.action_predictions,
            out_directory,
        )

    if errors and out_directory:
        _log_stories(
            story_evaluation.failed_stories,
            os.path.join(out_directory, FAILED_STORIES_FILE),
            "None of the test stories failed - all good!",
        )

    if successes and out_directory:
        _log_stories(
            story_evaluation.successful_stories,
            os.path.join(out_directory, SUCCESSFUL_STORIES_FILE),
            "None of the test stories succeeded :(",
        )

    if warnings and out_directory:
        _log_stories(
            story_evaluation.stories_with_warnings,
            os.path.join(out_directory, STORIES_WITH_WARNINGS_FILE),
            "No warnings for test stories",
        )

    return {
        "report": report,
        "precision": precision,
        "f1": f1,
        "accuracy": action_accuracy,
        "actions": story_evaluation.action_list,
        "in_training_data_fraction": story_evaluation.in_training_data_fraction,
        "is_end_to_end_evaluation": e2e,
    }
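

# A minimal, hypothetical invocation of the coroutine above. The model path and
# the test stories file are assumptions for illustration only; they are not part
# of the original code.
import asyncio

from rasa.core.agent import Agent


async def _run_story_evaluation_example() -> None:
    agent = Agent.load("models/example-model.tar.gz")  # assumed model path
    results = await test(
        "tests/test_stories.yml",  # assumed test stories file
        agent,
        out_directory="results",
        e2e=False,
        successes=True,
        errors=True,
    )
    print(f"Action accuracy: {results['accuracy']}")


if __name__ == "__main__":
    asyncio.run(_run_story_evaluation_example())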