def test_get_evaluation_metrics(
    targets, predictions, expected_precision, expected_fscore, expected_accuracy
):
    from rasa.model_testing import get_evaluation_metrics
    from rasa.nlu.test import NO_ENTITY

    report, precision, f1, accuracy = get_evaluation_metrics(
        targets, predictions, True, exclude_label=NO_ENTITY
    )

    assert f1 == expected_fscore
    assert precision == expected_precision
    assert accuracy == expected_accuracy
    assert NO_ENTITY not in report
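# Hedged sketch (not Rasa's implementation): roughly the four values the test above
# asserts against, assuming get_evaluation_metrics wraps sklearn's weighted-average
# scores. The function name and the averaging choice are illustration-only assumptions.
from sklearn import metrics


def sketch_evaluation_metrics(targets, predictions):
    """Return a classification report plus weighted precision, F1, and accuracy."""
    report = metrics.classification_report(targets, predictions, output_dict=True)
    precision = metrics.precision_score(
        targets, predictions, average="weighted", zero_division=0
    )
    f1 = metrics.f1_score(targets, predictions, average="weighted", zero_division=0)
    accuracy = metrics.accuracy_score(targets, predictions)
    return report, precision, f1, accuracy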
async def test(
    stories: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    out_directory: Optional[Text] = None,
    fail_on_prediction_errors: bool = False,
    e2e: bool = False,
    disable_plotting: bool = False,
    successes: bool = False,
    errors: bool = True,
    warnings: bool = True,
) -> Dict[Text, Any]:
    """Run the evaluation of the stories, optionally plot the results.

    Args:
        stories: the stories to evaluate on
        agent: the agent
        max_stories: maximum number of stories to consider
        out_directory: path to the directory to write results to
        fail_on_prediction_errors: boolean indicating whether to fail on prediction
            errors or not
        e2e: boolean indicating whether to use end-to-end evaluation or not
        disable_plotting: boolean indicating whether to disable plotting or not
        successes: boolean indicating whether to write down successful predictions
            or not
        errors: boolean indicating whether to write down incorrect predictions or not
        warnings: boolean indicating whether to write down prediction warnings or not

    Returns:
        Evaluation summary.
    """
    from rasa.model_testing import get_evaluation_metrics

    generator = await _create_data_generator(stories, agent, max_stories, e2e)
    completed_trackers = generator.generate_story_trackers()

    story_evaluation, _, entity_results = await _collect_story_predictions(
        completed_trackers, agent, fail_on_prediction_errors, use_e2e=e2e
    )

    evaluation_store = story_evaluation.evaluation_store

    with pywarnings.catch_warnings():
        from sklearn.exceptions import UndefinedMetricWarning

        pywarnings.simplefilter("ignore", UndefinedMetricWarning)

        targets, predictions = evaluation_store.serialise()

        if out_directory:
            report, precision, f1, action_accuracy = get_evaluation_metrics(
                targets, predictions, output_dict=True
            )

            # Add conversation level accuracy to story report.
            num_failed = len(story_evaluation.failed_stories)
            num_correct = len(story_evaluation.successful_stories)
            num_warnings = len(story_evaluation.stories_with_warnings)
            num_convs = num_failed + num_correct
            if num_convs and isinstance(report, Dict):
                conv_accuracy = num_correct / num_convs
                report["conversation_accuracy"] = {
                    "accuracy": conv_accuracy,
                    "correct": num_correct,
                    "with_warnings": num_warnings,
                    "total": num_convs,
                }

            report_filename = os.path.join(out_directory, REPORT_STORIES_FILE)
            rasa.shared.utils.io.dump_obj_as_json_to_file(report_filename, report)
            logger.info(f"Stories report saved to {report_filename}.")
        else:
            report, precision, f1, action_accuracy = get_evaluation_metrics(
                targets, predictions, output_dict=True
            )

        evaluate_entities(
            entity_results,
            POLICIES_THAT_EXTRACT_ENTITIES,
            out_directory,
            successes,
            errors,
            disable_plotting,
        )

    telemetry.track_core_model_test(len(generator.story_graph.story_steps), e2e, agent)

    _log_evaluation_table(
        evaluation_store.action_targets,
        "ACTION",
        action_accuracy,
        precision=precision,
        f1=f1,
        in_training_data_fraction=story_evaluation.in_training_data_fraction,
    )

    if not disable_plotting and out_directory:
        _plot_story_evaluation(
            evaluation_store.action_targets,
            evaluation_store.action_predictions,
            out_directory,
        )

    if errors and out_directory:
        _log_stories(
            story_evaluation.failed_stories,
            os.path.join(out_directory, FAILED_STORIES_FILE),
            "None of the test stories failed - all good!",
        )
    if successes and out_directory:
        _log_stories(
            story_evaluation.successful_stories,
            os.path.join(out_directory, SUCCESSFUL_STORIES_FILE),
            "None of the test stories succeeded :(",
        )
    if warnings and out_directory:
        _log_stories(
            story_evaluation.stories_with_warnings,
            os.path.join(out_directory, STORIES_WITH_WARNINGS_FILE),
            "No warnings for test stories",
        )

    return {
        "report": report,
        "precision": precision,
        "f1": f1,
        "accuracy": action_accuracy,
        "actions": story_evaluation.action_list,
        "in_training_data_fraction": story_evaluation.in_training_data_fraction,
        "is_end_to_end_evaluation": e2e,
    }
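# Hedged usage sketch for test() above; not part of the original module. The model path,
# stories file, and output directory are placeholder assumptions.
async def _example_run_story_evaluation() -> Dict[Text, Any]:
    from rasa.core.agent import Agent

    agent = Agent.load("models/latest.tar.gz")  # assumed model path
    return await test(
        "tests/test_stories.yml",  # assumed test stories file
        agent,
        out_directory="results",  # assumed output directory
        e2e=False,
    )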
async def run_test_on_stories(stories_path: str, model_path: str, e2e: bool = False):
    """Run tests on stories.

    Args:
        stories_path: path to the test stories file (YAML).
        model_path: path to the trained model the test should be run against.
        e2e: if True, run an end-to-end test in which intent predictions are
            evaluated along with action predictions.

    Returns:
        dictionary with evaluation results
    """
    from rasa.model_testing import get_evaluation_metrics
    from rasa.core.test import _create_data_generator, _collect_story_predictions
    from rasa.core.agent import Agent

    test_report = {}
    agent = Agent.load(model_path)

    generator = await _create_data_generator(
        stories_path, agent, use_conversation_test_files=e2e
    )
    completed_trackers = generator.generate_story_trackers()
    story_evaluation, _, _ = await _collect_story_predictions(
        completed_trackers, agent, use_e2e=e2e
    )

    targets, predictions = story_evaluation.evaluation_store.serialise()
    report, precision, f1, accuracy = get_evaluation_metrics(
        targets, predictions, output_dict=True
    )

    # Summarise failed and successful stories as plain dictionaries of their events.
    failed_stories_summary = []
    success_stories_summary = []
    for story in story_evaluation.failed_stories:
        events_tracker = [vars(event) for event in story.events]
        failed_stories_summary.append(
            {"name": story.sender_id, "events": events_tracker}
        )
    for story in story_evaluation.successful_stories:
        events_tracker = [vars(event) for event in story.events]
        success_stories_summary.append(
            {"name": story.sender_id, "events": events_tracker}
        )

    # Add conversation-level accuracy to the test report.
    num_failed = len(story_evaluation.failed_stories)
    num_correct = len(story_evaluation.successful_stories)
    num_warnings = len(story_evaluation.stories_with_warnings)
    num_convs = num_failed + num_correct
    if num_convs and isinstance(report, Dict):
        conv_accuracy = num_correct / num_convs
        test_report["conversation_accuracy"] = {
            "accuracy": conv_accuracy,
            "success_count": num_correct,
            "failure_count": num_failed,
            "total_count": num_convs,
            "with_warnings": num_warnings,
        }

    test_report.update(
        {
            # "report": report,
            "precision": precision,
            "f1": f1,
            "accuracy": accuracy,
            # "actions": story_evaluation.action_list,
            # "in_training_data_fraction": story_evaluation.in_training_data_fraction,
            # "is_end_to_end_evaluation": e2e,
            "failed_stories": failed_stories_summary,
            # "successful_stories": success_stories_summary,
        }
    )
    return test_report
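# Hedged usage sketch, not part of the original code: a synchronous wrapper around
# run_test_on_stories() for scripts or CI hooks. The default paths are assumptions.
def run_test_on_stories_sync(
    stories_path: str = "tests/test_stories.yml",  # assumed stories file
    model_path: str = "models/latest.tar.gz",  # assumed model path
    e2e: bool = False,
):
    import asyncio

    return asyncio.run(run_test_on_stories(stories_path, model_path, e2e=e2e))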
def __evaluate_entities(entity_results, extractors: Set[Text]) -> Dict:
    """Creates summary statistics for each entity extractor.

    Computes precision, F1, and accuracy over entity labels for each extractor,
    along with counts of successful and incorrect entity predictions.

    Args:
        entity_results: entity evaluation results
        extractors: entity extractors to consider

    Returns:
        dictionary with evaluation results
    """
    from rasa.model_testing import get_evaluation_metrics
    from rasa.nlu.test import (
        NO_ENTITY,
        align_all_entity_predictions,
        merge_labels,
        substitute_labels,
        collect_successful_entity_predictions,
        collect_incorrect_entity_predictions,
    )
    from rasa.shared.nlu.constants import NO_ENTITY_TAG

    aligned_predictions = align_all_entity_predictions(entity_results, extractors)
    merged_targets = merge_labels(aligned_predictions)
    merged_targets = substitute_labels(merged_targets, NO_ENTITY_TAG, NO_ENTITY)

    result = {}
    for extractor in extractors:
        merged_predictions = merge_labels(aligned_predictions, extractor)
        merged_predictions = substitute_labels(
            merged_predictions, NO_ENTITY_TAG, NO_ENTITY
        )
        report, precision, f1, accuracy = get_evaluation_metrics(
            merged_targets,
            merged_predictions,
            output_dict=False,
            exclude_label=NO_ENTITY,
        )
        successes = collect_successful_entity_predictions(
            entity_results, merged_predictions, merged_targets
        )
        errors = collect_incorrect_entity_predictions(
            entity_results, merged_predictions, merged_targets
        )
        result[extractor] = {
            "total_count": len(successes) + len(errors),
            "success_count": len(successes),
            "failure_count": len(errors),
            "precision": precision,
            "f1_score": f1,
            "accuracy": accuracy,
            # "successes": successes, "errors": errors,
        }
    return result
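# Hedged sketch, not part of the original code: wiring __evaluate_entities() into the
# story-prediction flow used by run_test_on_stories() above. The stories path, model path,
# and the extractor name in the set are assumptions; pass whichever extractors actually
# appear in your entity results.
async def _example_entity_report(stories_path: str, model_path: str) -> Dict:
    from rasa.core.agent import Agent
    from rasa.core.test import _create_data_generator, _collect_story_predictions

    agent = Agent.load(model_path)
    generator = await _create_data_generator(
        stories_path, agent, use_conversation_test_files=True
    )
    completed_trackers = generator.generate_story_trackers()
    # entity_results is the third element returned by _collect_story_predictions.
    _, _, entity_results = await _collect_story_predictions(
        completed_trackers, agent, use_e2e=True
    )
    return __evaluate_entities(entity_results, {"DIETClassifier"})  # assumed extractor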