Example #1
async def test_events_schema(
    monkeypatch: MonkeyPatch, default_agent: Agent, config_path: Text
):
    # this allows us to patch the printing function used in debug mode to
    # collect the reported events
    monkeypatch.setenv("RASA_TELEMETRY_DEBUG", "true")
    monkeypatch.setenv("RASA_TELEMETRY_ENABLED", "true")

    mock = Mock()
    monkeypatch.setattr(telemetry, "print_telemetry_event", mock)

    with open(TELEMETRY_EVENTS_JSON) as f:
        schemas = json.load(f)["events"]
    initial = asyncio.all_tasks()
    # Generate all known backend telemetry events, and then use events.json to
    # validate their schema.
    training_data = TrainingDataImporter.load_from_config(config_path)

    with telemetry.track_model_training(training_data, "rasa"):
        await asyncio.sleep(1)

    telemetry.track_telemetry_disabled()

    telemetry.track_data_split(0.5, "nlu")

    telemetry.track_validate_files(True)

    telemetry.track_data_convert("yaml", "nlu")

    telemetry.track_tracker_export(5, TrackerStore(domain=None), EventBroker())

    telemetry.track_interactive_learning_start(True, False)

    telemetry.track_server_start([CmdlineInput()], None, None, 42, True)

    telemetry.track_project_init("tests/")

    telemetry.track_shell_started("nlu")

    telemetry.track_rasa_x_local()

    telemetry.track_visualization()

    telemetry.track_core_model_test(5, True, default_agent)

    telemetry.track_nlu_model_test(TrainingData())

    pending = asyncio.all_tasks() - initial
    await asyncio.gather(*pending)

    assert mock.call_count == 15

    for args, _ in mock.call_args_list:
        event = args[0]
        # `metrics_id` automatically gets added to all events but is
        # not part of the schema, so we need to remove it before validation
        del event["properties"]["metrics_id"]
        jsonschema.validate(
            instance=event["properties"], schema=schemas[event["event"]]
        )
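The validation at the end of this test relies on `jsonschema.validate`. Below is a minimal, self-contained sketch of the same pattern, using a hypothetical event payload and schema rather than the real entries from events.json:

import jsonschema

# Hypothetical schema for illustration; the real schemas are read from events.json.
schema = {
    "type": "object",
    "properties": {"language": {"type": "string"}},
    "required": ["language"],
    "additionalProperties": False,
}

event = {"event": "example_event", "properties": {"language": "en", "metrics_id": "abc"}}

# Strip the automatically added `metrics_id` before validating, as the test above does.
event["properties"].pop("metrics_id", None)
jsonschema.validate(instance=event["properties"], schema=schema)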
Example #2
async def test(
    stories: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    out_directory: Optional[Text] = None,
    fail_on_prediction_errors: bool = False,
    e2e: bool = False,
    disable_plotting: bool = False,
    successes: bool = False,
    errors: bool = True,
) -> Dict[Text, Any]:
    """Run the evaluation of the stories, optionally plot the results.

    Args:
        stories: the stories to evaluate on
        agent: the agent
        max_stories: maximum number of stories to consider
        out_directory: path to the directory to write results to
        fail_on_prediction_errors: boolean indicating whether to fail on prediction
            errors or not
        e2e: boolean indicating whether to use end-to-end evaluation or not
        disable_plotting: boolean indicating whether to disable plotting or not
        successes: boolean indicating whether to write down successful predictions or
            not
        errors: boolean indicating whether to write down incorrect predictions or not

    Returns:
        Evaluation summary.
    """
    from rasa.test import get_evaluation_metrics

    generator = await _create_data_generator(stories, agent, max_stories, e2e)
    completed_trackers = generator.generate_story_trackers()

    story_evaluation, _ = await _collect_story_predictions(
        completed_trackers, agent, fail_on_prediction_errors, e2e)

    evaluation_store = story_evaluation.evaluation_store

    with warnings.catch_warnings():
        from sklearn.exceptions import UndefinedMetricWarning

        warnings.simplefilter("ignore", UndefinedMetricWarning)

        targets, predictions = evaluation_store.serialise()

        if out_directory:
            report, precision, f1, accuracy = get_evaluation_metrics(
                targets, predictions, output_dict=True)

            report_filename = os.path.join(out_directory, REPORT_STORIES_FILE)
            rasa.shared.utils.io.dump_obj_as_json_to_file(
                report_filename, report)
            logger.info(f"Stories report saved to {report_filename}.")
        else:
            report, precision, f1, accuracy = get_evaluation_metrics(
                targets, predictions, output_dict=True)

    telemetry.track_core_model_test(len(generator.story_graph.story_steps),
                                    e2e, agent)

    _log_evaluation_table(
        evaluation_store.action_targets,
        "ACTION",
        report,
        precision,
        f1,
        accuracy,
        story_evaluation.in_training_data_fraction,
        include_report=False,
    )

    if not disable_plotting and out_directory:
        _plot_story_evaluation(
            evaluation_store.action_targets,
            evaluation_store.action_predictions,
            out_directory,
        )

    if errors and out_directory:
        _log_stories(
            story_evaluation.failed_stories,
            os.path.join(out_directory, FAILED_STORIES_FILE),
        )
    if successes and out_directory:
        _log_stories(
            story_evaluation.successful_stories,
            os.path.join(out_directory, SUCCESSFUL_STORIES_FILE),
        )

    return {
        "report": report,
        "precision": precision,
        "f1": f1,
        "accuracy": accuracy,
        "actions": story_evaluation.action_list,
        "in_training_data_fraction":
        story_evaluation.in_training_data_fraction,
        "is_end_to_end_evaluation": e2e,
    }
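A possible way to invoke this `test` coroutine, sketched under the assumption that a trained model exists and that `rasa.core.agent.Agent.load` is used to load it (the paths below are placeholders):

import asyncio

from rasa.core.agent import Agent

async def run_story_evaluation() -> None:
    # Placeholder paths; adjust to your project layout.
    agent = Agent.load("models/my-model.tar.gz")
    results = await test(
        stories="tests/test_stories.yml",
        agent=agent,
        out_directory="results",
        e2e=False,
    )
    print(results["accuracy"], results["in_training_data_fraction"])

asyncio.run(run_story_evaluation())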
Example #3
async def test(
    stories: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    out_directory: Optional[Text] = None,
    fail_on_prediction_errors: bool = False,
    e2e: bool = False,
    disable_plotting: bool = False,
    successes: bool = False,
    errors: bool = True,
    warnings: bool = True,
) -> Dict[Text, Any]:
    """Run the evaluation of the stories, optionally plot the results.

    Args:
        stories: the stories to evaluate on
        agent: the agent
        max_stories: maximum number of stories to consider
        out_directory: path to the directory to write results to
        fail_on_prediction_errors: boolean indicating whether to fail on prediction
            errors or not
        e2e: boolean indicating whether to use end-to-end evaluation or not
        disable_plotting: boolean indicating whether to disable plotting or not
        successes: boolean indicating whether to write down successful predictions or
            not
        errors: boolean indicating whether to write down incorrect predictions or not
        warnings: boolean indicating whether to write down prediction warnings or not

    Returns:
        Evaluation summary.
    """
    from rasa.model_testing import get_evaluation_metrics

    generator = _create_data_generator(stories, agent, max_stories, e2e)
    completed_trackers = generator.generate_story_trackers()

    story_evaluation, _, entity_results = await _collect_story_predictions(
        completed_trackers, agent, fail_on_prediction_errors, use_e2e=e2e)

    evaluation_store = story_evaluation.evaluation_store

    with pywarnings.catch_warnings():
        from sklearn.exceptions import UndefinedMetricWarning

        pywarnings.simplefilter("ignore", UndefinedMetricWarning)

        targets, predictions = evaluation_store.serialise()

        if out_directory:
            report, precision, f1, action_accuracy = get_evaluation_metrics(
                targets, predictions, output_dict=True)

            # Add conversation level accuracy to story report.
            num_failed = len(story_evaluation.failed_stories)
            num_correct = len(story_evaluation.successful_stories)
            num_warnings = len(story_evaluation.stories_with_warnings)
            num_convs = num_failed + num_correct
            if num_convs and isinstance(report, Dict):
                conv_accuracy = num_correct / num_convs
                report["conversation_accuracy"] = {
                    "accuracy": conv_accuracy,
                    "correct": num_correct,
                    "with_warnings": num_warnings,
                    "total": num_convs,
                }
            report_filename = os.path.join(out_directory, REPORT_STORIES_FILE)
            rasa.shared.utils.io.dump_obj_as_json_to_file(
                report_filename, report)
            logger.info(f"Stories report saved to {report_filename}.")

        else:
            report, precision, f1, action_accuracy = get_evaluation_metrics(
                targets, predictions, output_dict=True)

        evaluate_entities(
            entity_results,
            POLICIES_THAT_EXTRACT_ENTITIES,
            out_directory,
            successes,
            errors,
            disable_plotting,
        )

    telemetry.track_core_model_test(len(generator.story_graph.story_steps),
                                    e2e, agent)

    _log_evaluation_table(
        evaluation_store.action_targets,
        "ACTION",
        action_accuracy,
        precision=precision,
        f1=f1,
        in_training_data_fraction=story_evaluation.in_training_data_fraction,
    )

    if not disable_plotting and out_directory:
        _plot_story_evaluation(
            evaluation_store.action_targets,
            evaluation_store.action_predictions,
            out_directory,
        )

    if errors and out_directory:
        _log_stories(
            story_evaluation.failed_stories,
            os.path.join(out_directory, FAILED_STORIES_FILE),
            "None of the test stories failed - all good!",
        )
    if successes and out_directory:
        _log_stories(
            story_evaluation.successful_stories,
            os.path.join(out_directory, SUCCESSFUL_STORIES_FILE),
            "None of the test stories succeeded :(",
        )
    if warnings and out_directory:
        _log_stories(
            story_evaluation.stories_with_warnings,
            os.path.join(out_directory, STORIES_WITH_WARNINGS_FILE),
            "No warnings for test stories",
        )

    return {
        "report": report,
        "precision": precision,
        "f1": f1,
        "accuracy": action_accuracy,
        "actions": story_evaluation.action_list,
        "in_training_data_fraction":
        story_evaluation.in_training_data_fraction,
        "is_end_to_end_evaluation": e2e,
    }
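The conversation-level accuracy added to the report in this version is simply correct conversations over all evaluated conversations. A worked example with hypothetical counts of 8 successful, 2 failed, and 1 warning-only test story:

num_correct, num_failed, num_warnings = 8, 2, 1
num_convs = num_correct + num_failed     # 10 evaluated conversations
conv_accuracy = num_correct / num_convs  # 0.8

conversation_accuracy = {
    "accuracy": conv_accuracy,
    "correct": num_correct,
    "with_warnings": num_warnings,
    "total": num_convs,
}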