Example 1
import pytest

# Assumption: in rasa 2.x the NO_ENTITY constant lives in rasa.nlu.test.
from rasa.nlu.test import NO_ENTITY

# One illustrative (hypothetical) case; the original suite parametrizes more.
@pytest.mark.parametrize(
    "targets, predictions, expected_precision, expected_fscore, expected_accuracy",
    [(["location"], ["location"], 1.0, 1.0, 1.0)],
)
def test_get_evaluation_metrics(targets, predictions, expected_precision,
                                expected_fscore, expected_accuracy):
    from rasa.test import get_evaluation_metrics

    # Third positional argument is output_dict=True, so report is a dict.
    report, precision, f1, accuracy = get_evaluation_metrics(
        targets, predictions, True, exclude_label=NO_ENTITY)

    assert f1 == expected_fscore
    assert precision == expected_precision
    assert accuracy == expected_accuracy
    assert NO_ENTITY not in report
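
For reference, a minimal standalone call to the function under test. The label values here are made up, but the keyword arguments match those used in the examples:

from rasa.test import get_evaluation_metrics

# Targets and predictions agree perfectly, so every metric is 1.0 and the
# dict-style report is keyed by the observed labels.
report, precision, f1, accuracy = get_evaluation_metrics(
    ["greet", "goodbye"], ["greet", "goodbye"], output_dict=True)
print(precision, f1, accuracy)  # 1.0 1.0 1.0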
Example 2
async def test(
    stories: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    out_directory: Optional[Text] = None,
    fail_on_prediction_errors: bool = False,
    e2e: bool = False,
    disable_plotting: bool = False,
    successes: bool = False,
    errors: bool = True,
) -> Dict[Text, Any]:
    """Run the evaluation of the stories, optionally plot the results.

    Args:
        stories: the stories to evaluate on
        agent: the agent
        max_stories: maximum number of stories to consider
        out_directory: path to the directory to store results in
        fail_on_prediction_errors: boolean indicating whether to fail on prediction
            errors or not
        e2e: boolean indicating whether to use end-to-end evaluation or not
        disable_plotting: boolean indicating whether to disable plotting or not
        successes: boolean indicating whether to write down successful predictions or
            not
        errors: boolean indicating whether to write down incorrect predictions or not

    Returns:
        Evaluation summary.
    """
    from rasa.test import get_evaluation_metrics

    generator = await _create_data_generator(stories, agent, max_stories, e2e)
    completed_trackers = generator.generate_story_trackers()

    story_evaluation, _ = await _collect_story_predictions(
        completed_trackers, agent, fail_on_prediction_errors, e2e)

    evaluation_store = story_evaluation.evaluation_store

    with warnings.catch_warnings():
        from sklearn.exceptions import UndefinedMetricWarning

        warnings.simplefilter("ignore", UndefinedMetricWarning)

        targets, predictions = evaluation_store.serialise()

        report, precision, f1, accuracy = get_evaluation_metrics(
            targets, predictions, output_dict=True)

        if out_directory:
            report_filename = os.path.join(out_directory, REPORT_STORIES_FILE)
            rasa.shared.utils.io.dump_obj_as_json_to_file(
                report_filename, report)
            logger.info(f"Stories report saved to {report_filename}.")

    telemetry.track_core_model_test(len(generator.story_graph.story_steps),
                                    e2e, agent)

    _log_evaluation_table(
        evaluation_store.action_targets,
        "ACTION",
        report,
        precision,
        f1,
        accuracy,
        story_evaluation.in_training_data_fraction,
        include_report=False,
    )

    if not disable_plotting and out_directory:
        _plot_story_evaluation(
            evaluation_store.action_targets,
            evaluation_store.action_predictions,
            out_directory,
        )

    if errors and out_directory:
        _log_stories(
            story_evaluation.failed_stories,
            os.path.join(out_directory, FAILED_STORIES_FILE),
        )
    if successes and out_directory:
        _log_stories(
            story_evaluation.successful_stories,
            os.path.join(out_directory, SUCCESSFUL_STORIES_FILE),
        )

    return {
        "report": report,
        "precision": precision,
        "f1": f1,
        "accuracy": accuracy,
        "actions": story_evaluation.action_list,
        "in_training_data_fraction":
        story_evaluation.in_training_data_fraction,
        "is_end_to_end_evaluation": e2e,
    }
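
A hedged sketch of driving the coroutine above. The model and stories paths are hypothetical, and loading via rasa.core.agent.Agent.load is an assumption about the surrounding rasa 2.x API:

import asyncio

from rasa.core.agent import Agent


async def run_story_test() -> None:
    # Hypothetical paths; substitute your trained model and test stories.
    agent = Agent.load("models/dialogue-model.tar.gz")
    results = await test(
        "tests/test_stories.yml",
        agent,
        out_directory="results",
    )
    print(f"story-level F1: {results['f1']}")


asyncio.run(run_story_test())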
Example 3
async def _collect_story_predictions(
    completed_trackers: List["DialogueStateTracker"],
    agent: "Agent",
    fail_on_prediction_errors: bool = False,
    use_e2e: bool = False,
) -> Tuple[StoryEvaluation, int]:
    """Test the stories from a file, running them through the stored model."""
    from rasa.test import get_evaluation_metrics
    from tqdm import tqdm

    story_eval_store = EvaluationStore()
    failed = []
    success = []
    correct_dialogues = []
    number_of_stories = len(completed_trackers)

    logger.info(f"Evaluating {number_of_stories} stories\nProgress:")

    action_list = []

    for tracker in tqdm(completed_trackers):
        (
            tracker_results,
            predicted_tracker,
            tracker_actions,
        ) = await _predict_tracker_actions(tracker, agent,
                                           fail_on_prediction_errors, use_e2e)

        story_eval_store.merge_store(tracker_results)

        action_list.extend(tracker_actions)

        if tracker_results.has_prediction_target_mismatch():
            # there is at least one wrong prediction
            failed.append(predicted_tracker)
            correct_dialogues.append(0)
        else:
            correct_dialogues.append(1)
            success.append(predicted_tracker)

    logger.info("Finished collecting predictions.")
    with warnings.catch_warnings():
        from sklearn.exceptions import UndefinedMetricWarning

        warnings.simplefilter("ignore", UndefinedMetricWarning)
        # An all-ones target vector against the per-story correctness flags:
        # accuracy becomes the fraction of fully correct dialogues.
        report, precision, f1, accuracy = get_evaluation_metrics(
            [1] * len(completed_trackers), correct_dialogues)

    in_training_data_fraction = _in_training_data_fraction(action_list)

    _log_evaluation_table(
        [1] * len(completed_trackers),
        "END-TO-END" if use_e2e else "CONVERSATION",
        report,
        precision,
        f1,
        accuracy,
        in_training_data_fraction,
        include_report=False,
    )

    return (
        StoryEvaluation(
            evaluation_store=story_eval_store,
            failed_stories=failed,
            successful_stories=success,
            action_list=action_list,
            in_training_data_fraction=in_training_data_fraction,
        ),
        number_of_stories,
    )
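
Note how _collect_story_predictions computes dialogue-level metrics: the [1] * len(completed_trackers) target vector treats every story as expected-correct, so accuracy reduces to the fraction of dialogues predicted without a single mismatch. A small sketch of that arithmetic:

from rasa.test import get_evaluation_metrics

# Three stories; the second contained at least one wrong action prediction.
correct_dialogues = [1, 0, 1]
_, _, _, accuracy = get_evaluation_metrics(
    [1] * len(correct_dialogues), correct_dialogues)
print(accuracy)  # ~0.667: two of the three dialogues were fully correct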