Example #1
def collect_story_predictions(
        completed_trackers,  # type: List[DialogueStateTracker]
        agent,  # type: Agent
        fail_on_prediction_errors=False,  # type: bool
        use_e2e=False  # type: bool
):
    # type: (...) -> Tuple[StoryEvalution, int]
    """Test the stories from a file, running them through the stored model."""

    story_eval_store = EvaluationStore()
    failed = []
    correct_dialogues = []
    num_stories = len(completed_trackers)

    logger.info("Evaluating {} stories\n" "Progress:".format(num_stories))

    action_list = []

    for tracker in tqdm(completed_trackers):
        tracker_results, predicted_tracker, tracker_actions = \
            _predict_tracker_actions(tracker, agent,
                                     fail_on_prediction_errors, use_e2e)

        story_eval_store.merge_store(tracker_results)

        action_list.extend(tracker_actions)

        if tracker_results.has_prediction_target_mismatch():
            # there is at least one wrong prediction
            failed.append(predicted_tracker)
            correct_dialogues.append(0)
        else:
            correct_dialogues.append(1)

    logger.info("Finished collecting predictions.")
    report, precision, f1, accuracy = get_evaluation_metrics(
        [1] * len(completed_trackers), correct_dialogues)

    in_training_data_fraction = _in_training_data_fraction(action_list)

    log_evaluation_table([1] * len(completed_trackers),
                         "END-TO-END" if use_e2e else "CONVERSATION",
                         report,
                         precision,
                         f1,
                         accuracy,
                         in_training_data_fraction,
                         include_report=False)

    return (StoryEvalution(
        evaluation_store=story_eval_store,
        failed_stories=failed,
        action_list=action_list,
        in_training_data_fraction=in_training_data_fraction), num_stories)
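
The dialogue-level accuracy above is computed by comparing a list of ones against correct_dialogues. Below is a minimal sketch of that computation, assuming get_evaluation_metrics wraps the usual sklearn metric helpers (an assumption; the real helper may differ in averaging or reporting):

# Minimal sketch, assuming get_evaluation_metrics wraps sklearn's metric helpers.
from sklearn.metrics import (accuracy_score, classification_report,
                             f1_score, precision_score)


def sketch_get_evaluation_metrics(targets, predictions):
    # weighted averages are a common choice for multi-class evaluation
    report = classification_report(targets, predictions)
    precision = precision_score(targets, predictions, average="weighted")
    f1 = f1_score(targets, predictions, average="weighted")
    accuracy = accuracy_score(targets, predictions)
    return report, precision, f1, accuracy


# e.g. 3 of 4 dialogues were predicted without a single wrong action
correct_dialogues = [1, 0, 1, 1]
golds = [1] * len(correct_dialogues)
_, _, _, accuracy = sketch_get_evaluation_metrics(golds, correct_dialogues)
print(accuracy)  # 0.75
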
Example #2
def test_entity():
    # load the trained model
    interpreter = Interpreter.load(model_location)

    duckling_extractors = {"ner_duckling", "ner_duckling_http"}

    # create a dictionary to collect per-extractor entity results
    entity_results = defaultdict(lambda: defaultdict(list))

    # get the entity extractors used by the interpreter
    extractors = evaluate.get_entity_extractors(interpreter)

    # get entity predictions and tokens
    entity_predictions, tokens = evaluate.get_entity_predictions(
        interpreter, testing_data)

    # Create classification report
    if duckling_extractors.intersection(extractors):
        entity_predictions = evaluate.remove_duckling_entities(
            entity_predictions)
        extractors = evaluate.remove_duckling_extractors(extractors)

    if not extractors:
        return entity_results

    # get entity targets
    entity_targets = evaluate.get_entity_targets(testing_data)

    # align predictions with the entity targets
    aligned_predictions = evaluate.align_all_entity_predictions(
        entity_targets, entity_predictions, tokens, extractors)

    merged_targets = evaluate.merge_labels(aligned_predictions)
    merged_targets = evaluate.substitute_labels(merged_targets, "O",
                                                "no_entity")

    for extractor in extractors:
        merged_predictions = evaluate.merge_labels(aligned_predictions,
                                                   extractor)
        merged_predictions = evaluate.substitute_labels(
            merged_predictions, "O", "no_entity")
        report, precision, f1, accuracy = evaluate.get_evaluation_metrics(
            merged_targets, merged_predictions)
        entity_results[extractor]["Accuracy"].append(accuracy)
        entity_results[extractor]["F1-score"].append(f1)
        entity_results[extractor]["Precision"].append(precision)

    print("entity_results:  {}\n".format(entity_results),
          "Classification report: \n{}".format(report))
Example #3
def test_intent():
    # load the trained model
    interpreter = Interpreter.load(model_location)
    # get true target of the testing data
    targets = evaluate.get_intent_targets(testing_data)
    # get predictions of the testing data
    predictions = evaluate.get_intent_predictions(interpreter, testing_data)
    # create a confusion matrix and summary statistics for intent predictions
    evaluate.evaluate_intents(targets, predictions)

    # generate classification report, precision, F1 score and accuracy
    report, precision, f1, accuracy = evaluate.get_evaluation_metrics(
        targets, predictions)
    print("F1-Score:  {}\n".format(f1), "Precision: {}\n".format(precision),
          "Accuracy:  {}\n".format(accuracy),
          "Classification report: \n{}".format(report))
Example #4
def log_evaluation_table(golds,
                         predictions,
                         name,
                         include_report=True):  # pragma: no cover
    """Log the sklearn evaluation metrics."""
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UndefinedMetricWarning)
        report, precision, f1, accuracy = get_evaluation_metrics(
            golds, predictions)

    logger.info("Evaluation Results on {} level:".format(name))
    logger.info("\tCorrect:   {} / {}".format(int(len(golds) * accuracy),
                                              len(golds)))
    logger.info("\tF1-Score:  {:.3f}".format(f1))
    logger.info("\tPrecision: {:.3f}".format(precision))
    logger.info("\tAccuracy:  {:.3f}".format(accuracy))

    if include_report:
        logger.info("\tClassification report: \n{}".format(report))
Example #5
def run_story_evaluation(resource_name,
                         agent,
                         max_stories=None,
                         out_file_stories=None,
                         out_file_plot=None,
                         fail_on_prediction_errors=False,
                         use_e2e=False):
    """Run the evaluation of the stories, optionally plots the results."""

    completed_trackers = _generate_trackers(resource_name, agent, max_stories,
                                            use_e2e)

    story_evaluation = collect_story_predictions(completed_trackers, agent,
                                                 fail_on_prediction_errors,
                                                 use_e2e)

    evaluation_store = story_evaluation.evaluation_store

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UndefinedMetricWarning)
        report, precision, f1, accuracy = get_evaluation_metrics(
            evaluation_store.serialise_targets(),
            evaluation_store.serialise_predictions())

    if out_file_plot:
        plot_story_evaluation(evaluation_store.action_targets,
                              evaluation_store.action_predictions, report,
                              precision, f1, accuracy,
                              story_evaluation.in_training_data_fraction,
                              out_file_plot)

    log_failed_stories(story_evaluation.failed_stories, out_file_stories)

    return {
        "report": report,
        "precision": precision,
        "f1": f1,
        "accuracy": accuracy,
        "actions": story_evaluation.action_list,
        "in_training_data_fraction":
        story_evaluation.in_training_data_fraction,
        "is_end_to_end_evaluation": use_e2e
    }
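
A hedged usage sketch for run_story_evaluation, assuming the old rasa_core layout (the module path, model directory and stories file are placeholders):

# Sketch: running the story evaluation end to end.
# The module path and file paths are assumptions, not taken from the source.
from rasa_core.agent import Agent
from rasa_core.evaluate import run_story_evaluation

agent = Agent.load("models/dialogue")                    # hypothetical dialogue model
results = run_story_evaluation("data/test_stories.md",   # hypothetical stories file
                               agent,
                               out_file_plot="story_confmat.pdf")
print(results["accuracy"], results["in_training_data_fraction"])
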
Example #6
    def evaluate(self, data, project=None, model=None):
        # type: (Text, Optional[Text], Optional[Text]) -> Dict[Text, Any]
        """Perform a model evaluation."""

        project = project or RasaNLUModelConfig.DEFAULT_PROJECT_NAME
        model = model or None
        file_name = utils.create_temporary_file(data, "_training_data")
        test_data = load_data(file_name)

        if project not in self.project_store:
            raise InvalidProjectError("Project {} could not "
                                      "be found".format(project))

        preds_json = self.parse_training_examples(test_data.intent_examples,
                                                  project,
                                                  model)

        predictions = [
            {"text": e.text,
             "intent": e.data.get("intent"),
             "predicted": p.get("intent", {}).get("name"),
             "confidence": p.get("intent", {}).get("confidence")}
            for e, p in zip(test_data.intent_examples, preds_json)
        ]

        y_true = [e.data.get("intent") for e in test_data.intent_examples]
        y_true = clean_intent_labels(y_true)

        y_pred = [p.get("intent", {}).get("name") for p in preds_json]
        y_pred = clean_intent_labels(y_pred)

        report, precision, f1, accuracy = get_evaluation_metrics(y_true,
                                                                 y_pred)

        return {
            "intent_evaluation": {
                "report": report,
                "predictions": predictions,
                "precision": precision,
                "f1_score": f1,
                "accuracy": accuracy}
        }
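
For reference, a sketch of the shape of the dictionary this evaluate method returns (keys taken from the code above; all values are purely illustrative placeholders):

# Illustrative shape of the evaluate() result; every value is a placeholder.
example_result = {
    "intent_evaluation": {
        "report": "<sklearn classification report>",
        "predictions": [{"text": "hello there",
                         "intent": "greet",
                         "predicted": "greet",
                         "confidence": 0.92}],
        "precision": 0.91,
        "f1_score": 0.90,
        "accuracy": 0.93,
    }
}
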
Example #7
def evaluate_intents(intent_results,
                     report_folder,
                     successes_filename,
                     errors_filename,
                     confmat_filename,
                     intent_hist_filename):  # pragma: no cover
    """Creates a confusion matrix and summary statistics for intent predictions.
    Log samples which could not be classified correctly and save them to file.
    Creates a confidence histogram which is saved to file.
    Wrong and correct prediction confidences will be
    plotted in separate bars of the same histogram plot.
    Only considers those examples with a set intent.
    Others are filtered out. Returns a dictionary of containing the
    evaluation result."""

    # remove empty intent targets
    num_examples = len(intent_results)
    intent_results = remove_empty_intent_examples(intent_results)

    logger.info("Intent Evaluation: Only considering those "
                "{} examples that have a defined intent out "
                "of {} examples".format(len(intent_results), num_examples))

    targets, predictions = _targets_predictions_from(intent_results)

    if report_folder:
        report, precision, f1, accuracy = get_evaluation_metrics(
                targets, predictions, output_dict=True)

        report_filename = os.path.join(report_folder, 'intent_report.json')

        save_json(report, report_filename)
        logger.info("Classification report saved to {}."
                    .format(report_filename))

    else:
        report, precision, f1, accuracy = get_evaluation_metrics(targets,
                                                                 predictions)
        log_evaluation_table(report, precision, f1, accuracy)

    if successes_filename:
        # save classified samples to file for debugging
        collect_nlu_successes(intent_results, successes_filename)

    if errors_filename:
        # log and save misclassified samples to file for debugging
        collect_nlu_errors(intent_results, errors_filename)

    if confmat_filename:
        from sklearn.metrics import confusion_matrix
        from sklearn.utils.multiclass import unique_labels
        import matplotlib.pyplot as plt

        cnf_matrix = confusion_matrix(targets, predictions)
        labels = unique_labels(targets, predictions)
        plot_confusion_matrix(cnf_matrix, classes=labels,
                              title='Intent Confusion matrix',
                              out=confmat_filename)
        plt.show()

        plot_intent_confidences(intent_results,
                                intent_hist_filename)

        plt.show()

    predictions = [
        {
            "text": res.message,
            "intent": res.target,
            "entities": res.target_entities,
            "predicted_entities": res.entities_prediction,
            "predicted": res.prediction,
            "confidence": res.confidence
        } for res in intent_results
    ]

    return {
        "predictions": predictions,
        "report": report,
        "precision": precision,
        "f1_score": f1,
        "accuracy": accuracy
    }
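
When report_folder is set, the branch above depends on get_evaluation_metrics(..., output_dict=True) producing a JSON-serialisable report. A sketch of that idea with plain sklearn (an assumption about what the helper and save_json wrap):

# Sketch: writing a JSON classification report, assuming get_evaluation_metrics
# forwards output_dict to sklearn's classification_report and save_json wraps json.dump.
import json
import os

from sklearn.metrics import classification_report

targets = ["greet", "goodbye", "greet"]
predictions = ["greet", "greet", "greet"]

report = classification_report(targets, predictions, output_dict=True)

report_folder = "reports"  # hypothetical output folder
os.makedirs(report_folder, exist_ok=True)
with open(os.path.join(report_folder, "intent_report.json"), "w") as f:
    json.dump(report, f, indent=2)
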