Example #1
def test_get_evaluation_metrics(targets, predictions, expected_precision,
                                expected_fscore, expected_accuracy):
    from rasa.model_testing import get_evaluation_metrics
    from rasa.nlu.test import NO_ENTITY

    report, precision, f1, accuracy = get_evaluation_metrics(
        targets, predictions, True, exclude_label=NO_ENTITY)

    assert f1 == expected_fscore
    assert precision == expected_precision
    assert accuracy == expected_accuracy
    assert NO_ENTITY not in report
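
The test above depends on parameters supplied from outside the snippet, most likely a pytest.mark.parametrize decorator that the excerpt does not show. Below is a minimal, self-contained sketch of how it could be driven; the decorator and the concrete parameter values are assumptions for illustration, not taken from the original test suite, and NO_ENTITY is imported from rasa.nlu.test as in Example #4 below.

import pytest

from rasa.model_testing import get_evaluation_metrics
from rasa.nlu.test import NO_ENTITY


# Hypothetical parameter set: with predictions identical to the targets,
# precision, F1 and accuracy should all come out as 1.0, and the excluded
# NO_ENTITY label must not appear in the returned report.
@pytest.mark.parametrize(
    "targets, predictions, expected_precision, expected_fscore, expected_accuracy",
    [
        (
            [NO_ENTITY, "person", "location", NO_ENTITY],
            [NO_ENTITY, "person", "location", NO_ENTITY],
            1.0,
            1.0,
            1.0,
        ),
    ],
)
def test_get_evaluation_metrics_perfect_predictions(
    targets, predictions, expected_precision, expected_fscore, expected_accuracy
):
    report, precision, f1, accuracy = get_evaluation_metrics(
        targets, predictions, True, exclude_label=NO_ENTITY
    )

    assert precision == expected_precision
    assert f1 == expected_fscore
    assert accuracy == expected_accuracy
    assert NO_ENTITY not in report
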
Example #2
async def test(
    stories: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    out_directory: Optional[Text] = None,
    fail_on_prediction_errors: bool = False,
    e2e: bool = False,
    disable_plotting: bool = False,
    successes: bool = False,
    errors: bool = True,
    warnings: bool = True,
) -> Dict[Text, Any]:
    """Run the evaluation of the stories, optionally plot the results.

    Args:
        stories: the stories to evaluate on
        agent: the agent
        max_stories: maximum number of stories to consider
        out_directory: path to the directory to write results to
        fail_on_prediction_errors: boolean indicating whether to fail on prediction
            errors or not
        e2e: boolean indicating whether to use end-to-end evaluation or not
        disable_plotting: boolean indicating whether to disable plotting or not
        successes: boolean indicating whether to write down successful predictions or
            not
        errors: boolean indicating whether to write down incorrect predictions or not
        warnings: boolean indicating whether to write down prediction warnings or not

    Returns:
        Evaluation summary.
    """
    from rasa.model_testing import get_evaluation_metrics

    generator = _create_data_generator(stories, agent, max_stories, e2e)
    completed_trackers = generator.generate_story_trackers()

    story_evaluation, _, entity_results = await _collect_story_predictions(
        completed_trackers, agent, fail_on_prediction_errors, use_e2e=e2e)

    evaluation_store = story_evaluation.evaluation_store

    with pywarnings.catch_warnings():
        from sklearn.exceptions import UndefinedMetricWarning

        pywarnings.simplefilter("ignore", UndefinedMetricWarning)

        targets, predictions = evaluation_store.serialise()

        if out_directory:
            report, precision, f1, action_accuracy = get_evaluation_metrics(
                targets, predictions, output_dict=True)

            # Add conversation level accuracy to story report.
            num_failed = len(story_evaluation.failed_stories)
            num_correct = len(story_evaluation.successful_stories)
            num_warnings = len(story_evaluation.stories_with_warnings)
            num_convs = num_failed + num_correct
            if num_convs and isinstance(report, Dict):
                conv_accuracy = num_correct / num_convs
                report["conversation_accuracy"] = {
                    "accuracy": conv_accuracy,
                    "correct": num_correct,
                    "with_warnings": num_warnings,
                    "total": num_convs,
                }
            report_filename = os.path.join(out_directory, REPORT_STORIES_FILE)
            rasa.shared.utils.io.dump_obj_as_json_to_file(
                report_filename, report)
            logger.info(f"Stories report saved to {report_filename}.")

        else:
            report, precision, f1, action_accuracy = get_evaluation_metrics(
                targets, predictions, output_dict=True)

        evaluate_entities(
            entity_results,
            POLICIES_THAT_EXTRACT_ENTITIES,
            out_directory,
            successes,
            errors,
            disable_plotting,
        )

    telemetry.track_core_model_test(len(generator.story_graph.story_steps),
                                    e2e, agent)

    _log_evaluation_table(
        evaluation_store.action_targets,
        "ACTION",
        action_accuracy,
        precision=precision,
        f1=f1,
        in_training_data_fraction=story_evaluation.in_training_data_fraction,
    )

    if not disable_plotting and out_directory:
        _plot_story_evaluation(
            evaluation_store.action_targets,
            evaluation_store.action_predictions,
            out_directory,
        )

    if errors and out_directory:
        _log_stories(
            story_evaluation.failed_stories,
            os.path.join(out_directory, FAILED_STORIES_FILE),
            "None of the test stories failed - all good!",
        )
    if successes and out_directory:
        _log_stories(
            story_evaluation.successful_stories,
            os.path.join(out_directory, SUCCESSFUL_STORIES_FILE),
            "None of the test stories succeeded :(",
        )
    if warnings and out_directory:
        _log_stories(
            story_evaluation.stories_with_warnings,
            os.path.join(out_directory, STORIES_WITH_WARNINGS_FILE),
            "No warnings for test stories",
        )

    return {
        "report": report,
        "precision": precision,
        "f1": f1,
        "accuracy": action_accuracy,
        "actions": story_evaluation.action_list,
        "in_training_data_fraction":
        story_evaluation.in_training_data_fraction,
        "is_end_to_end_evaluation": e2e,
    }
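
For orientation, here is a minimal sketch of how the coroutine defined above might be invoked from a script. The model and stories paths are placeholders, and Agent.load is used as in Example #3; treat this as an illustration rather than the canonical entry point.

import asyncio

from rasa.core.agent import Agent


async def run_story_evaluation() -> None:
    # Placeholder paths: point these at a real trained model and test stories file.
    agent = Agent.load("models/my-model.tar.gz")
    summary = await test(
        "tests/test_stories.yml",
        agent,
        out_directory="results",  # report and story files are written here
        e2e=False,
    )
    # The returned summary contains the keys shown at the end of the function above.
    print(summary["accuracy"], summary["precision"], summary["f1"])


asyncio.run(run_story_evaluation())
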
Example #3
    async def run_test_on_stories(stories_path: str, model_path: str, e2e: bool = False):
        """
        Run tests on stories.

        Args:
            stories_path: path to the test stories in YAML format.
            model_path: path to the trained model the test should be run against.
            e2e: if True, an end-to-end test is run where intent predictions are evaluated along with action predictions.

        Returns: dictionary with evaluation results
        """
        from rasa.model_testing import get_evaluation_metrics
        from rasa.core.test import _create_data_generator, _collect_story_predictions
        from rasa.core.agent import Agent

        test_report = {}
        agent = Agent.load(model_path)

        generator = await _create_data_generator(stories_path, agent, use_conversation_test_files=e2e)
        completed_trackers = generator.generate_story_trackers()

        story_evaluation, _, _ = await _collect_story_predictions(
            completed_trackers, agent, use_e2e=e2e
        )
        targets, predictions = story_evaluation.evaluation_store.serialise()
        report, precision, f1, accuracy = get_evaluation_metrics(targets, predictions, output_dict=True)
        failed_stories_summary = []
        success_stories_summary = []
        for story in story_evaluation.failed_stories:
            events_tracker = []
            for event in story.events:
                events_tracker.append(vars(event))
            failed_stories_summary.append({'name': story.sender_id, 'events': events_tracker})

        for story in story_evaluation.successful_stories:
            events_tracker = []
            for event in story.events:
                events_tracker.append(vars(event))
            success_stories_summary.append({'name': story.sender_id, 'events': events_tracker})

        num_failed = len(story_evaluation.failed_stories)
        num_correct = len(story_evaluation.successful_stories)
        num_warnings = len(story_evaluation.stories_with_warnings)
        num_convs = num_failed + num_correct
        if num_convs and isinstance(report, Dict):
            conv_accuracy = num_correct / num_convs
            test_report["conversation_accuracy"] = {
                "accuracy": conv_accuracy,
                "success_count": num_correct,
                "failure_count": num_failed,
                "total_count": num_convs,
                "with_warnings": num_warnings,
            }

        test_report.update({
            # "report": report,
            "precision": precision,
            "f1": f1,
            "accuracy": accuracy,
            # "actions": story_evaluation.action_list,
            # "in_training_data_fraction": story_evaluation.in_training_data_fraction,
            # "is_end_to_end_evaluation": e2e,
            "failed_stories": failed_stories_summary,
            # "successful_stories": success_stories_summary,
        })
        return test_report
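
To make the conversation-level arithmetic concrete, here is an illustrative shape of the dictionary run_test_on_stories returns, assuming 8 successful stories, 2 failed stories and 1 story with warnings. All metric values and the story name are made up.

# Illustrative only: conversation accuracy counts whole stories, so with
# 8 successes and 2 failures it is 8 / (8 + 2) = 0.8, regardless of how many
# individual actions were predicted correctly.
example_test_report = {
    "conversation_accuracy": {
        "accuracy": 0.8,
        "success_count": 8,
        "failure_count": 2,
        "total_count": 10,
        "with_warnings": 1,
    },
    "precision": 0.97,  # made-up action-level metrics
    "f1": 0.96,
    "accuracy": 0.95,
    "failed_stories": [
        {"name": "some_failed_story", "events": [...]},  # hypothetical entry
    ],
}
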
Example #4
    def __evaluate_entities(entity_results, extractors: Set[Text]) -> Dict:
        """
        Creates summary statistics for each entity extractor.

        Collects precision, F1, and accuracy for each extractor, along with its incorrect entity predictions.

        Args:
            entity_results: entity evaluation results
            extractors: entity extractors to consider

        Returns: dictionary with evaluation results
        """
        from rasa.model_testing import get_evaluation_metrics
        from rasa.nlu.test import (
            NO_ENTITY,
            align_all_entity_predictions,
            merge_labels,
            substitute_labels,
            collect_successful_entity_predictions,
            collect_incorrect_entity_predictions
        )

        from rasa.shared.nlu.constants import NO_ENTITY_TAG

        aligned_predictions = align_all_entity_predictions(entity_results, extractors)
        merged_targets = merge_labels(aligned_predictions)
        merged_targets = substitute_labels(merged_targets, NO_ENTITY_TAG, NO_ENTITY)

        result = {}

        for extractor in extractors:
            merged_predictions = merge_labels(aligned_predictions, extractor)
            merged_predictions = substitute_labels(
                merged_predictions, NO_ENTITY_TAG, NO_ENTITY
            )

            report, precision, f1, accuracy = get_evaluation_metrics(
                merged_targets,
                merged_predictions,
                output_dict=False,
                exclude_label=NO_ENTITY,
            )

            successes = collect_successful_entity_predictions(
                entity_results, merged_predictions, merged_targets
            )
            errors = collect_incorrect_entity_predictions(
                entity_results, merged_predictions, merged_targets
            )

            result[extractor] = {
                "total_count": len(successes) + len(errors),
                "success_count": len(successes),
                "failure_count": len(errors),
                "precision": precision,
                "f1_score": f1,
                "accuracy": accuracy,
                # 'successes': successes,
                'errors': errors
            }

        return result
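
Finally, a standalone sketch of the call pattern Example #4 applies per extractor, using hand-made label lists in place of merged_targets and merged_predictions. The labels are illustrative; NO_ENTITY is imported from rasa.nlu.test as in the examples above.

from rasa.model_testing import get_evaluation_metrics
from rasa.nlu.test import NO_ENTITY

# Aligned label sequences standing in for the merged targets and predictions
# of a single extractor.
targets = [NO_ENTITY, "city", "city", NO_ENTITY, "name"]
predictions = [NO_ENTITY, "city", NO_ENTITY, NO_ENTITY, "name"]

report, precision, f1, accuracy = get_evaluation_metrics(
    targets,
    predictions,
    output_dict=True,
    exclude_label=NO_ENTITY,
)

# As asserted in Example #1, the excluded NO_ENTITY label should not appear
# among the report keys.
assert NO_ENTITY not in report
print(precision, f1, accuracy)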