Example #1
def evaluate_entities(entity_results, extractors):  # pragma: no cover
    """Creates summary statistics for each entity extractor.
    Logs precision, recall, and F1 per entity type for each extractor."""

    aligned_predictions = align_all_entity_predictions(entity_results, extractors)
    merged_targets = merge_labels(aligned_predictions)
    merged_targets = substitute_labels(merged_targets, "O", "no_entity")

    result = {}

    for extractor in extractors:
        merged_predictions = merge_labels(aligned_predictions, extractor)
        merged_predictions = substitute_labels(merged_predictions, "O", "no_entity")

        report, precision, f1, accuracy = get_evaluation_metrics(
            merged_targets,
            merged_predictions,
            output_dict=True,
            exclude_label="no_entity",
        )

        result[extractor] = {
            "report": report,
            "precision": precision,
            "f1_score": f1,
            "accuracy": accuracy,
        }

    return result
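A minimal usage sketch, assuming `entity_results` and `extractors` come from Rasa's NLU test pipeline (both names here are placeholders, not values the excerpt defines):

# Hedged sketch: consuming the per-extractor results returned above.
# `entity_results` and `extractors` are assumed to be produced by a
# Rasa NLU test run, e.g. extractors = {"CRFEntityExtractor"}.
entity_evaluation = evaluate_entities(entity_results, extractors)
for extractor_name, metrics in entity_evaluation.items():
    print("{}: precision={:.3f}, f1={:.3f}, accuracy={:.3f}".format(
        extractor_name, metrics["precision"], metrics["f1_score"],
        metrics["accuracy"]))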
Example #2
def collect_story_predictions(
        completed_trackers: List['DialogueStateTracker'],
        agent: 'Agent',
        fail_on_prediction_errors: bool = False,
        use_e2e: bool = False) -> Tuple[StoryEvalution, int]:
    """Test the stories from a file, running them through the stored model."""
    from rasa.nlu.test import get_evaluation_metrics
    from tqdm import tqdm

    story_eval_store = EvaluationStore()
    failed = []
    correct_dialogues = []
    num_stories = len(completed_trackers)

    logger.info("Evaluating {} stories\nProgress:".format(num_stories))

    action_list = []

    for tracker in tqdm(completed_trackers):
        tracker_results, predicted_tracker, tracker_actions = _predict_tracker_actions(
            tracker, agent, fail_on_prediction_errors, use_e2e
        )

        story_eval_store.merge_store(tracker_results)

        action_list.extend(tracker_actions)

        if tracker_results.has_prediction_target_mismatch():
            # there is at least one wrong prediction
            failed.append(predicted_tracker)
            correct_dialogues.append(0)
        else:
            correct_dialogues.append(1)

    logger.info("Finished collecting predictions.")
    with warnings.catch_warnings():
        from sklearn.exceptions import UndefinedMetricWarning

        warnings.simplefilter("ignore", UndefinedMetricWarning)
        report, precision, f1, accuracy = get_evaluation_metrics(
            [1] * len(completed_trackers), correct_dialogues)

    in_training_data_fraction = _in_training_data_fraction(action_list)

    log_evaluation_table([1] * len(completed_trackers),
                         "END-TO-END" if use_e2e else "CONVERSATION",
                         report,
                         precision,
                         f1,
                         accuracy,
                         in_training_data_fraction,
                         include_report=False)

    return (StoryEvalution(
        evaluation_store=story_eval_store,
        failed_stories=failed,
        action_list=action_list,
        in_training_data_fraction=in_training_data_fraction), num_stories)
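A hedged usage sketch, assuming `completed_trackers` comes from Rasa's tracker generation step and `agent` is a loaded Agent (neither is defined in the excerpt):

# Hedged sketch: both inputs below are assumptions for illustration.
story_evaluation, num_stories = collect_story_predictions(
    completed_trackers, agent, fail_on_prediction_errors=False, use_e2e=False
)
print("Evaluated {} stories, {} with prediction errors".format(
    num_stories, len(story_evaluation.failed_stories)))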
Example #3
def test_get_evaluation_metrics(targets, predictions, expected_precision,
                                expected_fscore, expected_accuracy):
    report, precision, f1, accuracy = get_evaluation_metrics(
        targets, predictions, True, exclude_label=NO_ENTITY)

    assert f1 == expected_fscore
    assert precision == expected_precision
    assert accuracy == expected_accuracy
    assert NO_ENTITY not in report
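The excerpt omits the parametrize decorator that supplies these arguments; a plausible sketch of its shape, using an illustrative perfect-prediction case (every metric is 1.0) rather than Rasa's actual fixture data:

import pytest

# Hedged sketch of the missing decorator; the sample values are
# illustrative assumptions (perfect predictions, so all metrics are 1.0).
@pytest.mark.parametrize(
    "targets, predictions, expected_precision, expected_fscore, expected_accuracy",
    [(["greet", "goodbye"], ["greet", "goodbye"], 1.0, 1.0, 1.0)],
)
def test_get_evaluation_metrics(targets, predictions, expected_precision,
                                expected_fscore, expected_accuracy):
    ...  # body as in the example above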
Example #4
async def test(
    stories: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    out_directory: Optional[Text] = None,
    fail_on_prediction_errors: bool = False,
    e2e: bool = False,
    disable_plotting: bool = False,
):
    """Run the evaluation of the stories, optionally plot the results."""
    from rasa.nlu.test import get_evaluation_metrics

    completed_trackers = await _generate_trackers(stories, agent, max_stories,
                                                  e2e)

    story_evaluation, _ = collect_story_predictions(completed_trackers, agent,
                                                    fail_on_prediction_errors,
                                                    e2e)

    evaluation_store = story_evaluation.evaluation_store

    with warnings.catch_warnings():
        from sklearn.exceptions import UndefinedMetricWarning

        warnings.simplefilter("ignore", UndefinedMetricWarning)

        targets, predictions = evaluation_store.serialise()
        report, precision, f1, accuracy = get_evaluation_metrics(
            targets, predictions)

    if out_directory:
        plot_story_evaluation(
            evaluation_store.action_targets,
            evaluation_store.action_predictions,
            report,
            precision,
            f1,
            accuracy,
            story_evaluation.in_training_data_fraction,
            out_directory,
            disable_plotting,
        )

    log_failed_stories(story_evaluation.failed_stories, out_directory)

    return {
        "report": report,
        "precision": precision,
        "f1": f1,
        "accuracy": accuracy,
        "actions": story_evaluation.action_list,
        "in_training_data_fraction":
        story_evaluation.in_training_data_fraction,
        "is_end_to_end_evaluation": e2e,
    }
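A hedged invocation sketch; the stories path, output directory, and the pre-loaded `agent` are assumptions for illustration:

import asyncio

# Hedged sketch: run the evaluation coroutine from synchronous code.
results = asyncio.run(
    test("data/test_stories.md", agent, out_directory="results")
)
print("Story accuracy: {:.3f}".format(results["accuracy"]))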
Example #5
def evaluate_intents(intent_results):  # pragma: no cover
    """Creates a confusion matrix and summary statistics for intent predictions.

    Logs samples which could not be classified correctly and saves them to file.
    Creates a confidence histogram which is saved to file.
    Wrong and correct prediction confidences are plotted in separate
    bars of the same histogram plot. Only examples with a set intent
    are considered; others are filtered out. Returns a dictionary
    containing the evaluation results.
    """

    # remove empty intent targets
    intent_results = remove_empty_intent_examples(intent_results)

    target_intents, predicted_intents = _targets_predictions_from(
        intent_results, "intent_target", "intent_prediction"
    )

    report, precision, f1, accuracy = get_evaluation_metrics(
        target_intents, predicted_intents, output_dict=True
    )

    log = collect_nlu_errors(intent_results) + collect_nlu_successes(intent_results)

    predictions = [
        {
            "text": res.message,
            "intent": res.intent_target,
            "predicted": res.intent_prediction,
            "confidence": res.confidence,
        }
        for res in intent_results
    ]

    return {
        "predictions": predictions,
        "report": report,
        "precision": precision,
        "f1_score": f1,
        "accuracy": accuracy,
        "log": log,
    }
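A hedged usage sketch, assuming `intent_results` comes from a Rasa NLU test run (it is not defined in the excerpt):

# Hedged sketch: `intent_results` is an assumption for illustration.
intent_evaluation = evaluate_intents(intent_results)
print("Intent precision={:.3f}, f1={:.3f}".format(
    intent_evaluation["precision"], intent_evaluation["f1_score"]))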