def test_get_evaluation_metrics(
    targets, predictions, expected_precision, expected_fscore, expected_accuracy
):
    from rasa.test import get_evaluation_metrics

    report, precision, f1, accuracy = get_evaluation_metrics(
        targets, predictions, True, exclude_label=NO_ENTITY
    )

    assert f1 == expected_fscore
    assert precision == expected_precision
    assert accuracy == expected_accuracy
    assert NO_ENTITY not in report
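
# Illustrative sketch of the call shape exercised by the test above (hypothetical
# inputs, not taken from the original parametrization): when predictions match the
# targets exactly, get_evaluation_metrics is expected to return 1.0 for precision,
# f1 and accuracy, and the excluded NO_ENTITY label should not appear in the report.
#
#     report, precision, f1, accuracy = get_evaluation_metrics(
#         [NO_ENTITY, "greet", "goodbye"],
#         [NO_ENTITY, "greet", "goodbye"],
#         True,
#         exclude_label=NO_ENTITY,
#     )
#     assert (precision, f1, accuracy) == (1.0, 1.0, 1.0)
#     assert NO_ENTITY not in report
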
async def test(
    stories: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    out_directory: Optional[Text] = None,
    fail_on_prediction_errors: bool = False,
    e2e: bool = False,
    disable_plotting: bool = False,
    successes: bool = False,
    errors: bool = True,
) -> Dict[Text, Any]:
    """Run the evaluation of the stories, optionally plot the results.

    Args:
        stories: the stories to evaluate on
        agent: the agent
        max_stories: maximum number of stories to consider
        out_directory: path to the directory to write results to
        fail_on_prediction_errors: boolean indicating whether to fail on
            prediction errors or not
        e2e: boolean indicating whether to use end-to-end evaluation or not
        disable_plotting: boolean indicating whether to disable plotting or not
        successes: boolean indicating whether to write down successful
            predictions or not
        errors: boolean indicating whether to write down incorrect predictions
            or not

    Returns:
        Evaluation summary.
    """
    from rasa.test import get_evaluation_metrics

    generator = await _create_data_generator(stories, agent, max_stories, e2e)
    completed_trackers = generator.generate_story_trackers()

    story_evaluation, _ = await _collect_story_predictions(
        completed_trackers, agent, fail_on_prediction_errors, e2e
    )

    evaluation_store = story_evaluation.evaluation_store

    with warnings.catch_warnings():
        from sklearn.exceptions import UndefinedMetricWarning

        warnings.simplefilter("ignore", UndefinedMetricWarning)

        targets, predictions = evaluation_store.serialise()

        if out_directory:
            report, precision, f1, accuracy = get_evaluation_metrics(
                targets, predictions, output_dict=True
            )

            report_filename = os.path.join(out_directory, REPORT_STORIES_FILE)
            rasa.shared.utils.io.dump_obj_as_json_to_file(report_filename, report)
            logger.info(f"Stories report saved to {report_filename}.")
        else:
            report, precision, f1, accuracy = get_evaluation_metrics(
                targets, predictions, output_dict=True
            )

    telemetry.track_core_model_test(len(generator.story_graph.story_steps), e2e, agent)

    _log_evaluation_table(
        evaluation_store.action_targets,
        "ACTION",
        report,
        precision,
        f1,
        accuracy,
        story_evaluation.in_training_data_fraction,
        include_report=False,
    )

    if not disable_plotting and out_directory:
        _plot_story_evaluation(
            evaluation_store.action_targets,
            evaluation_store.action_predictions,
            out_directory,
        )

    if errors and out_directory:
        _log_stories(
            story_evaluation.failed_stories,
            os.path.join(out_directory, FAILED_STORIES_FILE),
        )
    if successes and out_directory:
        _log_stories(
            story_evaluation.successful_stories,
            os.path.join(out_directory, SUCCESSFUL_STORIES_FILE),
        )

    return {
        "report": report,
        "precision": precision,
        "f1": f1,
        "accuracy": accuracy,
        "actions": story_evaluation.action_list,
        "in_training_data_fraction": story_evaluation.in_training_data_fraction,
        "is_end_to_end_evaluation": e2e,
    }
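
# Minimal usage sketch (the file names and model path below are assumptions, not
# part of this module): the coroutine is normally driven from an event loop, e.g.
#
#     import asyncio
#     from rasa.core.agent import Agent
#
#     agent = Agent.load("models/my-model.tar.gz")  # hypothetical model path
#     results = asyncio.run(
#         test("tests/test_stories.yml", agent, out_directory="results", e2e=False)
#     )
#     print(results["accuracy"], results["in_training_data_fraction"])
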
async def _collect_story_predictions(
    completed_trackers: List["DialogueStateTracker"],
    agent: "Agent",
    fail_on_prediction_errors: bool = False,
    use_e2e: bool = False,
) -> Tuple[StoryEvaluation, int]:
    """Test the stories from a file, running them through the stored model."""
    from rasa.test import get_evaluation_metrics
    from tqdm import tqdm

    story_eval_store = EvaluationStore()
    failed = []
    success = []
    correct_dialogues = []
    number_of_stories = len(completed_trackers)

    logger.info(f"Evaluating {number_of_stories} stories\nProgress:")

    action_list = []

    for tracker in tqdm(completed_trackers):
        (
            tracker_results,
            predicted_tracker,
            tracker_actions,
        ) = await _predict_tracker_actions(
            tracker, agent, fail_on_prediction_errors, use_e2e
        )

        story_eval_store.merge_store(tracker_results)

        action_list.extend(tracker_actions)

        if tracker_results.has_prediction_target_mismatch():
            # there is at least one wrong prediction
            failed.append(predicted_tracker)
            correct_dialogues.append(0)
        else:
            correct_dialogues.append(1)
            success.append(predicted_tracker)

    logger.info("Finished collecting predictions.")

    with warnings.catch_warnings():
        from sklearn.exceptions import UndefinedMetricWarning

        warnings.simplefilter("ignore", UndefinedMetricWarning)
        report, precision, f1, accuracy = get_evaluation_metrics(
            [1] * len(completed_trackers), correct_dialogues
        )

    in_training_data_fraction = _in_training_data_fraction(action_list)

    _log_evaluation_table(
        [1] * len(completed_trackers),
        "END-TO-END" if use_e2e else "CONVERSATION",
        report,
        precision,
        f1,
        accuracy,
        in_training_data_fraction,
        include_report=False,
    )

    return (
        StoryEvaluation(
            evaluation_store=story_eval_store,
            failed_stories=failed,
            successful_stories=success,
            action_list=action_list,
            in_training_data_fraction=in_training_data_fraction,
        ),
        number_of_stories,
    )
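
# Note on the conversation-level metrics computed above (illustrative numbers only):
# comparing a list of ones against `correct_dialogues` makes accuracy the fraction
# of stories that were predicted without any mismatch, e.g.
#
#     correct_dialogues = [1, 0, 1, 1]
#     _, _, _, accuracy = get_evaluation_metrics([1, 1, 1, 1], correct_dialogues)
#     # accuracy == 0.75 -> 3 of the 4 stories were fully correct
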