async def test_core_only(project: Text):
    config_path = os.path.join(project, DEFAULT_CONFIG_PATH)
    domain_path = os.path.join(project, DEFAULT_DOMAIN_PATH)
    default_data_path = os.path.join(project, DEFAULT_DATA_PATH)
    actual = TrainingDataImporter.load_core_importer_from_config(
        config_path, domain_path, training_data_paths=[default_data_path]
    )

    assert isinstance(actual, CoreDataImporter)

    stories = await actual.get_stories()
    assert not stories.is_empty()

    domain = await actual.get_domain()
    assert not domain.is_empty()

    config = await actual.get_config()
    assert config

    nlu_data = await actual.get_nlu_data()
    assert nlu_data.is_empty()
Exemple #2
0
def test_example_bot_training_on_initial_project(tmp_path: Path):
    # we need to test this one separately, as we can't test it in place
    # configuration suggestions would otherwise change the initial file
    scaffold.create_initial_project(str(tmp_path))

    importer = TrainingDataImporter.load_from_config(
        str(tmp_path / "config.yml"),
        str(tmp_path / "domain.yml"),
        str(tmp_path / "data"),
    )

    with pytest.warns(UserWarning) as record:
        importer.get_nlu_data()
        importer.get_stories()

    # two for slot auto-fill removal
    assert len(record) == 2
    assert ("Slot auto-fill has been removed in 3.0 and replaced with "
            "a new explicit mechanism to set slots."
            in record[0].message.args[0])
    assert record[0].message.args[0] == record[1].message.args[0]
Exemple #3
0
async def _train_nlu_async(
    config: Text,
    nlu_data: Text,
    output: Text,
    train_path: Optional[Text] = None,
    fixed_model_name: Optional[Text] = None,
    persist_nlu_training_data: bool = False,
    additional_arguments: Optional[Dict] = None,
    domain: Optional[Union[Domain, Text]] = None,
    model_to_finetune: Optional[Text] = None,
    finetuning_epoch_fraction: float = 1.0,
) -> Optional[Text]:
    if not nlu_data:
        rasa.shared.utils.cli.print_error(
            "No NLU data given. Please provide NLU data in order to train "
            "a Rasa NLU model using the '--nlu' argument.")
        return

    # training NLU only hence the training files still have to be selected
    file_importer = TrainingDataImporter.load_nlu_importer_from_config(
        config, domain, training_data_paths=[nlu_data])

    training_data = await file_importer.get_nlu_data()
    if training_data.can_train_nlu_model():
        rasa.shared.utils.cli.print_error(
            f"Path '{nlu_data}' doesn't contain valid NLU data in it. "
            f"Please verify the data format. "
            f"The NLU model training will be skipped now.")
        return

    return await _train_nlu_with_validated_data(
        file_importer,
        output=output,
        train_path=train_path,
        fixed_model_name=fixed_model_name,
        persist_nlu_training_data=persist_nlu_training_data,
        additional_arguments=additional_arguments,
        model_to_finetune=model_to_finetune,
        finetuning_epoch_fraction=finetuning_epoch_fraction,
    )
Exemple #4
0
async def test_eval_data(component_builder, tmpdir, project):
    _config = RasaNLUModelConfig(
        {
            "pipeline": [
                {"name": "WhitespaceTokenizer"},
                {"name": "CountVectorsFeaturizer"},
                {"name": "DIETClassifier", "epochs": 2},
                {"name": "ResponseSelector", "epochs": 2},
            ],
            "language": "en",
        }
    )

    config_path = os.path.join(project, "config.yml")
    data_importer = TrainingDataImporter.load_nlu_importer_from_config(
        config_path,
        training_data_paths=[
            "data/examples/rasa/demo-rasa.md",
            "data/examples/rasa/demo-rasa-responses.md",
        ],
    )

    (_, _, persisted_path) = await train(
        _config,
        path=tmpdir.strpath,
        data=data_importer,
        component_builder=component_builder,
        persist_nlu_training_data=True,
    )

    interpreter = Interpreter.load(persisted_path, component_builder)

    data = await data_importer.get_nlu_data()
    intent_results, response_selection_results, entity_results, = get_eval_data(
        interpreter, data
    )

    assert len(intent_results) == 46
    assert len(response_selection_results) == 46
    assert len(entity_results) == 46
Exemple #5
0
def test_number_of_examples_per_intent_with_yaml(tmp_path: Path):
    domain_path = tmp_path / "domain.yml"
    domain_path.write_text(Domain.empty().as_yaml())

    config_path = tmp_path / "config.yml"
    config_path.touch()

    importer = TrainingDataImporter.load_from_dict(
        {},
        str(config_path),
        str(domain_path),
        [
            "data/test_number_nlu_examples/nlu.yml",
            "data/test_number_nlu_examples/stories.yml",
            "data/test_number_nlu_examples/rules.yml",
        ],
    )

    training_data = importer.get_nlu_data()
    assert training_data.intents == {"greet", "ask_weather"}
    assert training_data.number_of_examples_per_intent["greet"] == 2
    assert training_data.number_of_examples_per_intent["ask_weather"] == 3
Exemple #6
0
async def _create_data_generator(
    resource_name: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    use_conversation_test_files: bool = False,
) -> "TrainingDataGenerator":
    from rasa.shared.core.generator import TrainingDataGenerator

    test_data_importer = TrainingDataImporter.load_from_dict(
        training_data_paths=[resource_name])
    if use_conversation_test_files:
        story_graph = await test_data_importer.get_conversation_tests()
    else:
        story_graph = await test_data_importer.get_stories()

    return TrainingDataGenerator(
        story_graph,
        agent.domain,
        use_story_concatenation=False,
        augmentation_factor=0,
        tracker_limit=max_stories,
    )
Exemple #7
0
def test_no_warnings_with_default_project(tmp_path: Path):
    rasa.utils.common.copy_directory(Path("rasa/cli/initial_project"),
                                     tmp_path)

    importer = TrainingDataImporter.load_from_config(
        config_path=str(tmp_path / "config.yml"),
        domain_path=str(tmp_path / "domain.yml"),
        training_data_paths=[str(tmp_path / "data")],
    )

    config, _missing_keys, _configured_keys = DefaultV1Recipe.auto_configure(
        importer.get_config_file_for_auto_config(),
        importer.get_config(),
        TrainingType.END_TO_END,
    )
    graph_config = DefaultV1Recipe().graph_config_for_recipe(
        config, cli_parameters={}, training_type=TrainingType.END_TO_END)
    validator = DefaultV1RecipeValidator(graph_config.train_schema)

    with pytest.warns(None) as records:
        validator.validate(importer)
    assert len(records) == 0
Exemple #8
0
def test_should_not_retrain_core(domain_path: Text, tmp_path: Path,
                                 stack_config_path: Text):
    # Don't use `stories_path` as checkpoints currently break fingerprinting
    story_file = tmp_path / "simple_story.yml"
    story_file.write_text("""
stories:
- story: test_story
  steps:
  - intent: greet
  - action: utter_greet
    """)
    trained_model = train_core(domain_path, stack_config_path, str(story_file),
                               str(tmp_path))

    importer = TrainingDataImporter.load_from_config(
        stack_config_path, domain_path, training_data_paths=[str(story_file)])

    new_fingerprint = model.model_fingerprint(importer)

    result = model.should_retrain(new_fingerprint, trained_model, tmp_path)

    assert not result.should_retrain_core()
Exemple #9
0
async def test_eval_data(
    tmp_path: Path, project: Text, trained_rasa_model: Text,
):
    config_path = os.path.join(project, "config.yml")
    data_importer = TrainingDataImporter.load_nlu_importer_from_config(
        config_path,
        training_data_paths=[
            "data/examples/rasa/demo-rasa.yml",
            "data/examples/rasa/demo-rasa-responses.yml",
        ],
    )

    processor = Agent.load(trained_rasa_model).processor

    data = data_importer.get_nlu_data()
    (intent_results, response_selection_results, entity_results) = await get_eval_data(
        processor, data
    )

    assert len(intent_results) == 46
    assert len(response_selection_results) == 46
    assert len(entity_results) == 46
Exemple #10
0
    def inner(
        train_schema: GraphSchema,
        cache: Optional[TrainingCache] = None,
        model_storage: Optional[ModelStorage] = None,
        path: Optional[Path] = None,
        force_retraining: bool = False,
    ) -> Path:
        if not path:
            path = tmp_path_factory.mktemp("model_storage_path")
        if not model_storage:
            model_storage = LocalModelStorage.create(path)
        if not cache:
            cache = local_cache_creator(path)

        graph_trainer = GraphTrainer(
            model_storage=model_storage,
            cache=cache,
            graph_runner_class=DaskGraphRunner,
        )

        output_filename = path / "model.tar.gz"
        graph_trainer.train(
            GraphModelConfiguration(
                train_schema=train_schema,
                predict_schema=GraphSchema({}),
                language=None,
                core_target=None,
                nlu_target="nlu",
                training_type=TrainingType.BOTH,
            ),
            importer=TrainingDataImporter.load_from_dict(
                domain_path=str(domain_path)),
            output_filename=output_filename,
            force_retraining=force_retraining,
        )

        assert output_filename.is_file()
        return output_filename
Exemple #11
0
def test_core_warn_if_rule_data_unused(
    policy_type_not_consuming_rule_data: Type[Policy], ):

    importer = TrainingDataImporter.load_from_dict(
        domain_path="data/test_moodbot/domain.yml",
        training_data_paths=[
            "data/test_moodbot/data/nlu.yml",
            "data/test_moodbot/data/rules.yml",
        ],
    )

    graph_schema = GraphSchema({
        "policy":
        SchemaNode({}, policy_type_not_consuming_rule_data, "", "", {})
    })
    validator = DefaultV1RecipeValidator(graph_schema)

    with pytest.warns(
            UserWarning,
            match=("Found rule-based training data but no policy "
                   "supporting rule-based data."),
    ):
        validator.validate(importer)
Exemple #12
0
def test_core_warn_if_rule_data_missing(
        policy_type_consuming_rule_data: Type[Policy]):

    importer = TrainingDataImporter.load_from_dict(
        domain_path="data/test_e2ebot/domain.yml",
        training_data_paths=[
            "data/test_e2ebot/data/nlu.yml",
            "data/test_e2ebot/data/stories.yml",
        ],
    )

    graph_schema = GraphSchema({
        "policy":
        SchemaNode({}, policy_type_consuming_rule_data, "", "", {})
    })
    validator = DefaultV1RecipeValidator(graph_schema)

    with pytest.warns(
            UserWarning,
            match=("Found a rule-based policy in your configuration "
                   "but no rule-based training data."),
    ):
        validator.validate(importer)
Exemple #13
0
def test_nlu_only(project: Text):
    config_path = os.path.join(project, DEFAULT_CONFIG_PATH)
    default_data_path = os.path.join(project, DEFAULT_DATA_PATH)
    actual = TrainingDataImporter.load_nlu_importer_from_config(
        config_path, training_data_paths=[default_data_path])

    assert isinstance(actual, NluDataImporter)
    assert isinstance(actual._importer, ResponsesSyncImporter)

    stories = actual.get_stories()
    assert stories.is_empty()

    conversation_tests = actual.get_stories()
    assert conversation_tests.is_empty()

    domain = actual.get_domain()
    assert domain.is_empty()

    config = actual.get_config()
    assert config

    nlu_data = actual.get_nlu_data()
    assert not nlu_data.is_empty()
Exemple #14
0
async def test_nlu_data_domain_sync_with_retrieval_intents(project: Text):
    config_path = os.path.join(project, DEFAULT_CONFIG_PATH)
    domain_path = "data/test_domains/default_retrieval_intents.yml"
    data_paths = [
        "data/test_nlu/default_retrieval_intents.md",
        "data/test_responses/default.md",
    ]
    base_data_importer = TrainingDataImporter.load_from_dict({}, config_path,
                                                             domain_path,
                                                             data_paths)

    nlu_importer = NluDataImporter(base_data_importer)
    core_importer = CoreDataImporter(base_data_importer)

    importer = RetrievalModelsDataImporter(
        CombinedDataImporter([nlu_importer, core_importer]))
    domain = await importer.get_domain()
    nlu_data = await importer.get_nlu_data()

    assert domain.retrieval_intents == ["chitchat"]
    assert domain.intent_properties["chitchat"].get("is_retrieval_intent")
    assert domain.templates == nlu_data.responses
    assert "utter_chitchat" in domain.action_names
Exemple #15
0
def interactive(args: argparse.Namespace) -> None:
    _set_not_required_args(args)
    file_importer = TrainingDataImporter.load_from_config(
        args.config, args.domain, args.data if not args.core_only else [args.stories]
    )

    if args.model is None:
        story_graph = file_importer.get_stories()
        if not story_graph or story_graph.is_empty():
            rasa.shared.utils.cli.print_error_and_exit(
                "Could not run interactive learning without either core "
                "data or a model containing core data."
            )

        zipped_model = (
            train.run_core_training(args)
            if args.core_only
            else train.run_training(args)
        )
        if not zipped_model:
            rasa.shared.utils.cli.print_error_and_exit(
                "Could not train an initial model. Either pass paths "
                "to the relevant training files (`--data`, `--config`, `--domain`), "
                "or use 'rasa train' to train a model."
            )
    else:
        zipped_model = get_provided_model(args.model)
        if not (zipped_model and os.path.exists(zipped_model)):
            rasa.shared.utils.cli.print_error_and_exit(
                f"Interactive learning process cannot be started as no "
                f"initial model was found at path '{args.model}'.  "
                f"Use 'rasa train' to train a model."
            )
        if not args.skip_visualization:
            logger.info(f"Loading visualization data from {args.data}.")

    perform_interactive_learning(args, zipped_model, file_importer)
Exemple #16
0
def _create_data_generator(
    resource_name: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    use_conversation_test_files: bool = False,
) -> "TrainingDataGenerator":
    from rasa.shared.core.generator import TrainingDataGenerator

    tmp_domain_path = Path(tempfile.mkdtemp()) / "domain.yaml"
    agent.domain.persist(tmp_domain_path)
    test_data_importer = TrainingDataImporter.load_from_dict(
        training_data_paths=[resource_name], domain_path=str(tmp_domain_path))
    if use_conversation_test_files:
        story_graph = test_data_importer.get_conversation_tests()
    else:
        story_graph = test_data_importer.get_stories()

    return TrainingDataGenerator(
        story_graph,
        agent.domain,
        use_story_concatenation=False,
        augmentation_factor=0,
        tracker_limit=max_stories,
    )
Exemple #17
0
async def test_import_nlu_training_data_with_default_actions(project: Text):
    config_path = os.path.join(project, DEFAULT_CONFIG_PATH)
    domain_path = os.path.join(project, DEFAULT_DOMAIN_PATH)
    default_data_path = os.path.join(project, DEFAULT_DATA_PATH)
    importer = TrainingDataImporter.load_from_dict({}, config_path,
                                                   domain_path,
                                                   [default_data_path])

    assert isinstance(importer, E2EImporter)
    importer_without_e2e = importer.importer

    # Check additional NLU training data from domain was added
    nlu_data = await importer.get_nlu_data()

    assert len(nlu_data.training_examples) > len(
        (await importer_without_e2e.get_nlu_data()).training_examples)

    extended_training_data = await importer.get_nlu_data()
    assert all(
        Message(data={
            ACTION_NAME: action_name,
            ACTION_TEXT: ""
        }) in extended_training_data.training_examples
        for action_name in rasa.shared.core.constants.DEFAULT_ACTION_NAMES)
def test_no_warnings_with_default_project(tmp_path: Path):
    rasa.utils.common.copy_directory(Path("rasa/cli/initial_project"),
                                     tmp_path)

    importer = TrainingDataImporter.load_from_config(
        config_path=str(tmp_path / "config.yml"),
        domain_path=str(tmp_path / "domain.yml"),
        training_data_paths=[str(tmp_path / "data")],
    )

    graph_config = DefaultV1Recipe().graph_config_for_recipe(
        importer.get_config(),
        cli_parameters={},
        training_type=TrainingType.END_TO_END)
    validator = DefaultV1RecipeValidator(graph_config.train_schema)

    with pytest.warns(
            UserWarning,
            match="Slot auto-fill has been removed in 3.0") as records:
        validator.validate(importer)
    assert all([
        warn.message.args[0].startswith("Slot auto-fill has been removed")
        for warn in records.list
    ])
Exemple #19
0
    def _validate(
        self,
        importer: TrainingDataImporter,
        nlu: bool = True,
        core: bool = True,
    ) -> None:
        """Validate whether the finetuning setting conflicts with other settings.

        Note that this validation always takes into account the configuration of
        nlu *and* core part, while the validation of aspects of the domain and
        the NLU training data only happen if we request to validate finetuning
        with respect to NLU/Core models, respectively.

        For more details, see docstring of this class.

        Args:
            importer: a training data importer
            domain: the domain
            nlu: set to `False` if NLU part should not be validated
            core: set to `False` if Core part should not be validated
        Raises:
            `InvalidConfigException` if there is a conflict
        """
        if self._is_finetuning and not self._fingerprints:
            raise InvalidConfigException(
                f"Finetuning is enabled but the {self.__class__.__name__} "
                f"does not remember seeing a training run. Ensure that you have "
                f"trained your model at least once (with finetuning disabled) "
                f"and ensure that the  {self.__class__.__name__} is part of the "
                f"training graph. ")

        rasa_version = rasa.__version__
        if self._is_finetuning:
            old_rasa_version = self._fingerprints[FINGERPRINT_VERSION]
            if version.parse(old_rasa_version) < version.parse(
                    MINIMUM_COMPATIBLE_VERSION):
                raise InvalidConfigException(
                    f"The minimum compatible Rasa Version is "
                    f"{MINIMUM_COMPATIBLE_VERSION} but the model we attempt to "
                    f"finetune has been generated with an older version "
                    f"({old_rasa_version}.")
        self._fingerprints[FINGERPRINT_VERSION] = rasa_version

        config = importer.get_config()
        self._compare_or_memorize(
            fingerprint_key=FINGERPRINT_CONFIG_WITHOUT_EPOCHS_KEY,
            new_fingerprint=self._get_fingerprint_of_config_without_epochs(
                config),
            error_message=
            ("Cannot finetune because more than just the 'epoch' keys have been "
             "changed in the configuration. "
             "Please revert your configuration and only change "
             "the 'epoch' settings where needed."),
        )

        if core:
            # NOTE: If there's a consistency check between domain and core training data
            # that ensures domain and core training data are consistent, then we can
            # drop this check.
            domain = importer.get_domain()
            self._compare_or_memorize(
                fingerprint_key=FINGERPRINT_CORE,
                new_fingerprint=self.
                _get_fingerprint_of_domain_without_responses(domain),
                error_message=
                ("Cannot finetune because more than just the responses have been "
                 "changed in the domain."
                 "Please revert all settings in your domain file (except the "
                 "'responses')."),
            )

        if nlu:
            nlu_data = importer.get_nlu_data()
            self._compare_or_memorize(
                fingerprint_key=FINGERPRINT_NLU,
                new_fingerprint=nlu_data.label_fingerprint(),
                error_message=
                ("Cannot finetune because NLU training data contains new labels "
                 "or does not contain any examples for some known labels. "
                 "Please make sure that the NLU data that you use "
                 "for finetuning contains at least one example for every label "
                 "(i.e. intent, action name, ...) that was included in the NLU "
                 "data used for training the model which we attempt to finetune "
                 "now. Moreover, you must not add labels that were not included "
                 "during training before. "),
            )

        self.persist()
Exemple #20
0
async def train_async(
    domain: Union[Domain, Text],
    config: Text,
    training_files: Optional[Union[Text, List[Text]]],
    output: Text = DEFAULT_MODELS_PATH,
    dry_run: bool = False,
    force_training: bool = False,
    fixed_model_name: Optional[Text] = None,
    persist_nlu_training_data: bool = False,
    core_additional_arguments: Optional[Dict] = None,
    nlu_additional_arguments: Optional[Dict] = None,
    model_to_finetune: Optional[Text] = None,
    finetuning_epoch_fraction: float = 1.0,
) -> TrainingResult:
    """Trains a Rasa model (Core and NLU).

    Args:
        domain: Path to the domain file.
        config: Path to the config for Core and NLU.
        training_files: Paths to the training data for Core and NLU.
        output_path: Output path.
        dry_run: If `True` then no training will be done, and the information about
            whether the training needs to be done will be printed.
        force_training: If `True` retrain model even if data has not changed.
        fixed_model_name: Name of model to be stored.
        persist_nlu_training_data: `True` if the NLU training data should be persisted
            with the model.
        core_additional_arguments: Additional training parameters for core training.
        nlu_additional_arguments: Additional training parameters forwarded to training
            method of each NLU component.
        model_to_finetune: Optional path to a model which should be finetuned or
            a directory in case the latest trained model should be used.
        finetuning_epoch_fraction: The fraction currently specified training epochs
            in the model configuration which should be used for finetuning.

    Returns:
        An instance of `TrainingResult`.
    """
    file_importer = TrainingDataImporter.load_from_config(
        config, domain, training_files)
    with TempDirectoryPath(tempfile.mkdtemp()) as train_path:
        domain = await file_importer.get_domain()

        if domain.is_empty():
            nlu_model = await handle_domain_if_not_exists(
                file_importer, output, fixed_model_name)
            return TrainingResult(model=nlu_model)

        return await _train_async_internal(
            file_importer,
            train_path,
            output,
            dry_run,
            force_training,
            fixed_model_name,
            persist_nlu_training_data,
            core_additional_arguments=core_additional_arguments,
            nlu_additional_arguments=nlu_additional_arguments,
            model_to_finetune=model_to_finetune,
            finetuning_epoch_fraction=finetuning_epoch_fraction,
        )
Exemple #21
0
async def _train_core_with_validated_data(
    file_importer: TrainingDataImporter,
    output: Text,
    train_path: Optional[Text] = None,
    fixed_model_name: Optional[Text] = None,
    additional_arguments: Optional[Dict] = None,
    interpreter: Optional[Interpreter] = None,
    model_to_finetune: Optional["Text"] = None,
    finetuning_epoch_fraction: float = 1.0,
) -> Optional[Text]:
    """Train Core with validated training and config data."""
    import rasa.core.train

    with ExitStack() as stack:
        if train_path:
            # If the train path was provided, do nothing on exit.
            _train_path = train_path
        else:
            # Otherwise, create a temp train path and clean it up on exit.
            _train_path = stack.enter_context(
                TempDirectoryPath(tempfile.mkdtemp()))

        # normal (not compare) training
        rasa.shared.utils.cli.print_color(
            "Training Core model...",
            color=rasa.shared.utils.io.bcolors.OKBLUE)
        domain, config = await asyncio.gather(file_importer.get_domain(),
                                              file_importer.get_config())

        if model_to_finetune:
            rasa.shared.utils.common.mark_as_experimental_feature(
                "Incremental Training feature")
            model_to_finetune = await _core_model_for_finetuning(
                model_to_finetune,
                file_importer=file_importer,
                finetuning_epoch_fraction=finetuning_epoch_fraction,
            )

            if not model_to_finetune:
                rasa.shared.utils.cli.print_error_and_exit(
                    f"No Core model for finetuning found. Please make sure to either "
                    f"specify a path to a previous model or to have a finetunable "
                    f"model within the directory '{output}'.")

        async with telemetry.track_model_training(
                file_importer,
                model_type="core",
                is_finetuning=model_to_finetune is not None,
        ):
            await rasa.core.train.train(
                domain_file=domain,
                training_resource=file_importer,
                output_path=os.path.join(_train_path,
                                         DEFAULT_CORE_SUBDIRECTORY_NAME),
                policy_config=config,
                additional_arguments=additional_arguments,
                interpreter=interpreter,
                model_to_finetune=model_to_finetune,
            )
        rasa.shared.utils.cli.print_color(
            "Core model training completed.",
            color=rasa.shared.utils.io.bcolors.OKBLUE)

        if train_path is None:
            # Only Core was trained.
            new_fingerprint = await model.model_fingerprint(file_importer)
            return model.package_model(
                fingerprint=new_fingerprint,
                output_directory=output,
                train_path=_train_path,
                fixed_model_name=fixed_model_name,
                model_prefix="core-",
            )

        return _train_path
Exemple #22
0
async def train_core_async(
    domain: Union[Domain, Text],
    config: Text,
    stories: Text,
    output: Text,
    train_path: Optional[Text] = None,
    fixed_model_name: Optional[Text] = None,
    additional_arguments: Optional[Dict] = None,
    model_to_finetune: Optional[Text] = None,
    finetuning_epoch_fraction: float = 1.0,
) -> Optional[Text]:
    """Trains a Core model.

    Args:
        domain: Path to the domain file.
        config: Path to the config file for Core.
        stories: Path to the Core training data.
        output: Output path.
        train_path: If `None` the model will be trained in a temporary
            directory, otherwise in the provided directory.
        fixed_model_name: Name of model to be stored.
        additional_arguments: Additional training parameters.
        model_to_finetune: Optional path to a model which should be finetuned or
            a directory in case the latest trained model should be used.
        finetuning_epoch_fraction: The fraction currently specified training epochs
            in the model configuration which should be used for finetuning.

    Returns:
        If `train_path` is given it returns the path to the model archive,
        otherwise the path to the directory with the trained model files.

    """
    file_importer = TrainingDataImporter.load_core_importer_from_config(
        config, domain, [stories])
    stories, nlu_data, domain = await asyncio.gather(
        file_importer.get_stories(),
        file_importer.get_nlu_data(),
        file_importer.get_domain(),
    )

    if nlu_data.has_e2e_examples():
        rasa.shared.utils.cli.print_error(
            "Stories file contains e2e stories. Please train using `rasa train` so that"
            " the NLU model is also trained.")
        return None

    if domain.is_empty():
        rasa.shared.utils.cli.print_error(
            "Core training was skipped because no valid domain file was found. "
            "Please specify a valid domain using '--domain' argument or check "
            "if the provided domain file exists.")
        return None

    if not stories:
        rasa.shared.utils.cli.print_error(
            "No stories given. Please provide stories in order to "
            "train a Rasa Core model using the '--stories' argument.")
        return

    return await _train_core_with_validated_data(
        file_importer,
        output=output,
        train_path=train_path,
        fixed_model_name=fixed_model_name,
        additional_arguments=additional_arguments,
        model_to_finetune=model_to_finetune,
        finetuning_epoch_fraction=finetuning_epoch_fraction,
    )
Exemple #23
0
async def _train_async_internal(
    file_importer: TrainingDataImporter,
    train_path: Text,
    output_path: Text,
    dry_run: bool,
    force_training: bool,
    fixed_model_name: Optional[Text],
    persist_nlu_training_data: bool,
    core_additional_arguments: Optional[Dict] = None,
    nlu_additional_arguments: Optional[Dict] = None,
    model_to_finetune: Optional[Text] = None,
    finetuning_epoch_fraction: float = 1.0,
) -> TrainingResult:
    """Trains a Rasa model (Core and NLU). Use only from `train_async`.

    Args:
        file_importer: `TrainingDataImporter` which supplies the training data.
        train_path: Directory in which to train the model.
        output_path: Output path.
        dry_run: If `True` then no training will be done, and the information about
            whether the training needs to be done will be printed.
        force_training: If `True` retrain model even if data has not changed.
        fixed_model_name: Name of model to be stored.
        persist_nlu_training_data: `True` if the NLU training data should be persisted
            with the model.
        core_additional_arguments: Additional training parameters for core training.
        nlu_additional_arguments: Additional training parameters forwarded to training
            method of each NLU component.
        model_to_finetune: Optional path to a model which should be finetuned or
            a directory in case the latest trained model should be used.
        finetuning_epoch_fraction: The fraction currently specified training epochs
            in the model configuration which should be used for finetuning.

    Returns:
        An instance of `TrainingResult`.
    """
    stories, nlu_data = await asyncio.gather(file_importer.get_stories(),
                                             file_importer.get_nlu_data())

    new_fingerprint = await model.model_fingerprint(file_importer)
    old_model = model.get_latest_model(output_path)

    fingerprint_comparison = model.should_retrain(
        new_fingerprint, old_model, train_path, force_training=force_training)

    if dry_run:
        code, texts = dry_run_result(fingerprint_comparison)
        for text in texts:
            print_warning(text) if code > 0 else print_success(text)
        return TrainingResult(code=code)

    if nlu_data.has_e2e_examples():
        rasa.shared.utils.common.mark_as_experimental_feature(
            "end-to-end training")

    if stories.is_empty() and nlu_data.contains_no_pure_nlu_data():
        rasa.shared.utils.cli.print_error(
            "No training data given. Please provide stories and NLU data in "
            "order to train a Rasa model using the '--data' argument.")
        return TrainingResult()

    if stories.is_empty():
        rasa.shared.utils.cli.print_warning(
            "No stories present. Just a Rasa NLU model will be trained.")
        trained_model = await _train_nlu_with_validated_data(
            file_importer,
            output=output_path,
            fixed_model_name=fixed_model_name,
            persist_nlu_training_data=persist_nlu_training_data,
            additional_arguments=nlu_additional_arguments,
            model_to_finetune=model_to_finetune,
            finetuning_epoch_fraction=finetuning_epoch_fraction,
        )
        return TrainingResult(model=trained_model)

    # We will train nlu if there are any nlu example, including from e2e stories.
    if nlu_data.contains_no_pure_nlu_data(
    ) and not nlu_data.has_e2e_examples():
        rasa.shared.utils.cli.print_warning(
            "No NLU data present. Just a Rasa Core model will be trained.")
        trained_model = await _train_core_with_validated_data(
            file_importer,
            output=output_path,
            fixed_model_name=fixed_model_name,
            additional_arguments=core_additional_arguments,
            model_to_finetune=model_to_finetune,
            finetuning_epoch_fraction=finetuning_epoch_fraction,
        )

        return TrainingResult(model=trained_model)

    new_fingerprint = await model.model_fingerprint(file_importer)
    old_model = model.get_latest_model(output_path)

    if not force_training:
        fingerprint_comparison = model.should_retrain(
            new_fingerprint,
            old_model,
            train_path,
            has_e2e_examples=nlu_data.has_e2e_examples(),
        )
    else:
        fingerprint_comparison = FingerprintComparisonResult(
            force_training=True)

    if fingerprint_comparison.is_training_required():
        async with telemetry.track_model_training(
                file_importer,
                model_type="rasa",
        ):
            await _do_training(
                file_importer,
                output_path=output_path,
                train_path=train_path,
                fingerprint_comparison_result=fingerprint_comparison,
                fixed_model_name=fixed_model_name,
                persist_nlu_training_data=persist_nlu_training_data,
                core_additional_arguments=core_additional_arguments,
                nlu_additional_arguments=nlu_additional_arguments,
                old_model_zip_path=old_model,
                model_to_finetune=model_to_finetune,
                finetuning_epoch_fraction=finetuning_epoch_fraction,
            )
        trained_model = model.package_model(
            fingerprint=new_fingerprint,
            output_directory=output_path,
            train_path=train_path,
            fixed_model_name=fixed_model_name,
        )
        return TrainingResult(model=trained_model)

    rasa.shared.utils.cli.print_success(
        "Nothing changed. You can use the old model stored at '{}'."
        "".format(os.path.abspath(old_model)))
    return TrainingResult(model=old_model)
def test_nlu_training_data_provider(
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    config_path: Text,
    nlu_data_path: Text,
):
    # create a resource and an importer
    resource = Resource("xy")
    importer = TrainingDataImporter.load_from_config(
        config_path=config_path, training_data_paths=[nlu_data_path])

    # check the default configuration is as expected
    config_1 = NLUTrainingDataProvider.get_default_config()
    assert config_1["language"] is None
    assert config_1["persist"] is False

    # create a provider with persist == True
    provider_1 = NLUTrainingDataProvider.create(
        {
            "language": "en",
            "persist": True
        },
        default_model_storage,
        resource,
        default_execution_context,
    )
    assert isinstance(provider_1, NLUTrainingDataProvider)

    # check the data provided is as expected
    data_0 = provider_1.provide(importer)
    data_1 = importer.get_nlu_data(language="en")
    assert data_0.fingerprint() == data_1.fingerprint()

    # check the data was persisted
    with default_model_storage.read_from(resource) as resource_directory:
        data_file = os.path.join(str(resource_directory),
                                 DEFAULT_TRAINING_DATA_OUTPUT_PATH)
        data = load_data(resource_name=data_file, language="en")
        assert os.path.isfile(data_file)
        assert isinstance(data, TrainingData)

        # delete the persisted data
        os.remove(data_file)
        assert not os.path.isfile(data_file)

    # create a provider with persist == False
    provider_2 = NLUTrainingDataProvider.create(
        {
            "language": "en",
            "persist": False
        },
        default_model_storage,
        resource,
        default_execution_context,
    )
    provider_2.provide(importer)

    # check the data was not persisted
    with default_model_storage.read_from(resource) as resource_directory:
        data_file = os.path.join(str(resource_directory),
                                 DEFAULT_TRAINING_DATA_OUTPUT_PATH)
        assert not os.path.isfile(data_file)
Exemple #25
0
async def test_nlu_comparison(
    tmp_path: Path, monkeypatch: MonkeyPatch, nlu_as_json_path: Text
):
    config = {
        "language": "en",
        "pipeline": [
            {"name": "WhitespaceTokenizer"},
            {"name": "KeywordIntentClassifier"},
            {"name": "RegexEntityExtractor"},
        ],
    }
    # the configs need to be at a different path, otherwise the results are
    # combined on the same dictionary key and cannot be plotted properly
    configs = [write_file_config(config).name, write_file_config(config).name]

    # mock training
    monkeypatch.setattr(Interpreter, "load", Mock(spec=RasaNLUInterpreter))
    monkeypatch.setattr(sys.modules["rasa.nlu"], "train", AsyncMock())
    monkeypatch.setattr(
        sys.modules["rasa.nlu.test"],
        "remove_pretrained_extractors",
        Mock(return_value=None),
    )
    monkeypatch.setattr(
        sys.modules["rasa.nlu.test"],
        "get_eval_data",
        Mock(return_value=(1, None, (None,),)),
    )
    monkeypatch.setattr(
        sys.modules["rasa.nlu.test"],
        "evaluate_intents",
        Mock(return_value={"f1_score": 1}),
    )

    output = str(tmp_path)
    test_data_importer = TrainingDataImporter.load_from_dict(
        training_data_paths=[nlu_as_json_path]
    )
    test_data = await test_data_importer.get_nlu_data()
    await compare_nlu_models(
        configs, test_data, output, runs=2, exclusion_percentages=[50, 80]
    )

    assert set(os.listdir(output)) == {
        "run_1",
        "run_2",
        "results.json",
        "nlu_model_comparison_graph.pdf",
    }

    run_1_path = os.path.join(output, "run_1")
    assert set(os.listdir(run_1_path)) == {"50%_exclusion", "80%_exclusion", "test.md"}

    exclude_50_path = os.path.join(run_1_path, "50%_exclusion")
    modelnames = [os.path.splitext(os.path.basename(config))[0] for config in configs]

    modeloutputs = set(
        ["train"]
        + [f"{m}_report" for m in modelnames]
        + [f"{m}.tar.gz" for m in modelnames]
    )
    assert set(os.listdir(exclude_50_path)) == modeloutputs
Exemple #26
0
async def run_nlu_test_async(
    config: Optional[Union[Text, List[Text]]],
    data_path: Text,
    models_path: Text,
    output_dir: Text,
    cross_validation: bool,
    percentages: List[int],
    runs: int,
    no_errors: bool,
    all_args: Dict[Text, Any],
) -> None:
    """Runs NLU tests.

    Args:
        all_args: all arguments gathered in a Dict so we can pass it as one argument
                  to other functions.
        config: it refers to the model configuration file. It can be a single file or
                a list of multiple files or a folder with multiple config files inside.
        data_path: path for the nlu data.
        models_path: path to a trained Rasa model.
        output_dir: output path for any files created during the evaluation.
        cross_validation: indicates if it should test the model using cross validation
                          or not.
        percentages: defines the exclusion percentage of the training data.
        runs: number of comparison runs to make.
        no_errors: indicates if incorrect predictions should be written to a file
                   or not.
    """
    from rasa.model_testing import (
        compare_nlu_models,
        perform_nlu_cross_validation,
        test_nlu,
    )

    data_path = rasa.cli.utils.get_validated_path(data_path, "nlu",
                                                  DEFAULT_DATA_PATH)
    test_data_importer = TrainingDataImporter.load_from_dict(
        training_data_paths=[data_path])
    nlu_data = await test_data_importer.get_nlu_data()

    output = output_dir or DEFAULT_RESULTS_PATH
    all_args["errors"] = not no_errors
    rasa.shared.utils.io.create_directory(output)

    if config is not None and len(config) == 1:
        config = os.path.abspath(config[0])
        if os.path.isdir(config):
            config = rasa.shared.utils.io.list_files(config)

    if isinstance(config, list):
        logger.info(
            "Multiple configuration files specified, running nlu comparison mode."
        )

        config_files = []
        for file in config:
            try:
                validation_utils.validate_yaml_schema(
                    rasa.shared.utils.io.read_file(file),
                    CONFIG_SCHEMA_FILE,
                )
                config_files.append(file)
            except YamlException:
                rasa.shared.utils.io.raise_warning(
                    f"Ignoring file '{file}' as it is not a valid config file."
                )
                continue
        await compare_nlu_models(
            configs=config_files,
            test_data=nlu_data,
            output=output,
            runs=runs,
            exclusion_percentages=percentages,
        )
    elif cross_validation:
        logger.info("Test model using cross validation.")
        config = rasa.cli.utils.get_validated_path(config, "config",
                                                   DEFAULT_CONFIG_PATH)
        perform_nlu_cross_validation(config, nlu_data, output, all_args)
    else:
        model_path = rasa.cli.utils.get_validated_path(models_path, "model",
                                                       DEFAULT_MODELS_PATH)

        await test_nlu(model_path, data_path, output, all_args)
Exemple #27
0
async def test_import_nlu_training_data_from_e2e_stories(project: Text):
    config_path = os.path.join(project, DEFAULT_CONFIG_PATH)
    domain_path = os.path.join(project, DEFAULT_DOMAIN_PATH)
    default_data_path = os.path.join(project, DEFAULT_DATA_PATH)
    importer = TrainingDataImporter.load_from_dict({}, config_path,
                                                   domain_path,
                                                   [default_data_path])

    # The `E2EImporter` correctly wraps the underlying `CombinedDataImporter`
    assert isinstance(importer, E2EImporter)
    importer_without_e2e = importer.importer

    stories = StoryGraph([
        StoryStep(events=[
            SlotSet("some slot", "doesn't matter"),
            UserUttered("greet_from_stories", {"name": "greet_from_stories"}),
            ActionExecuted("utter_greet_from_stories"),
        ]),
        StoryStep(events=[
            UserUttered("how are you doing?"),
            ActionExecuted("utter_greet_from_stories", action_text="Hi Joey."),
        ]),
    ])

    # Patch to return our test stories
    importer_without_e2e.get_stories = asyncio.coroutine(lambda *args: stories)

    # The wrapping `E2EImporter` simply forwards these method calls
    assert (await importer_without_e2e.get_stories()).as_story_string() == (
        await importer.get_stories()).as_story_string()
    assert (await importer_without_e2e.get_config()) == (await
                                                         importer.get_config())

    # Check additional NLU training data from stories was added
    nlu_data = await importer.get_nlu_data()

    # The `E2EImporter` adds NLU training data based on our training stories
    assert len(nlu_data.training_examples) > len(
        (await importer_without_e2e.get_nlu_data()).training_examples)

    # Check if the NLU training data was added correctly from the story training data
    expected_additional_messages = [
        Message(data={
            TEXT: "greet_from_stories",
            INTENT_NAME: "greet_from_stories"
        }),
        Message(data={
            ACTION_NAME: "utter_greet_from_stories",
            ACTION_TEXT: ""
        }),
        Message(data={
            TEXT: "how are you doing?",
            INTENT_NAME: None
        }),
        Message(data={
            ACTION_NAME: "utter_greet_from_stories",
            ACTION_TEXT: "Hi Joey."
        }),
    ]

    assert all(m in nlu_data.training_examples
               for m in expected_additional_messages)
Exemple #28
0
async def _train_async_internal(
    file_importer: TrainingDataImporter,
    train_path: Text,
    output_path: Text,
    force_training: bool,
    fixed_model_name: Optional[Text],
    persist_nlu_training_data: bool,
    core_additional_arguments: Optional[Dict] = None,
    nlu_additional_arguments: Optional[Dict] = None,
) -> Optional[Text]:
    """Trains a Rasa model (Core and NLU). Use only from `train_async`.

    Args:
        file_importer: `TrainingDataImporter` which supplies the training data.
        train_path: Directory in which to train the model.
        output_path: Output path.
        force_training: If `True` retrain model even if data has not changed.
        fixed_model_name: Name of model to be stored.
        persist_nlu_training_data: `True` if the NLU training data should be persisted
                                   with the model.
        core_additional_arguments: Additional training parameters for core training.
        nlu_additional_arguments: Additional training parameters forwarded to training
                                  method of each NLU component.

    Returns:
        Path of the trained model archive.
    """

    stories, nlu_data = await asyncio.gather(file_importer.get_stories(),
                                             file_importer.get_nlu_data())

    if stories.is_empty() and nlu_data.is_empty():
        print_error(
            "No training data given. Please provide stories and NLU data in "
            "order to train a Rasa model using the '--data' argument.")
        return

    if stories.is_empty():
        print_warning(
            "No stories present. Just a Rasa NLU model will be trained.")
        return await _train_nlu_with_validated_data(
            file_importer,
            output=output_path,
            fixed_model_name=fixed_model_name,
            persist_nlu_training_data=persist_nlu_training_data,
            additional_arguments=nlu_additional_arguments,
        )

    if nlu_data.is_empty():
        print_warning(
            "No NLU data present. Just a Rasa Core model will be trained.")
        return await _train_core_with_validated_data(
            file_importer,
            output=output_path,
            fixed_model_name=fixed_model_name,
            additional_arguments=core_additional_arguments,
        )

    new_fingerprint = await model.model_fingerprint(file_importer)
    old_model = model.get_latest_model(output_path)
    fingerprint_comparison = FingerprintComparisonResult(
        force_training=force_training)
    if not force_training:
        fingerprint_comparison = model.should_retrain(new_fingerprint,
                                                      old_model, train_path)

    if fingerprint_comparison.is_training_required():
        await _do_training(
            file_importer,
            output_path=output_path,
            train_path=train_path,
            fingerprint_comparison_result=fingerprint_comparison,
            fixed_model_name=fixed_model_name,
            persist_nlu_training_data=persist_nlu_training_data,
            core_additional_arguments=core_additional_arguments,
            nlu_additional_arguments=nlu_additional_arguments,
            old_model_zip_path=old_model,
        )

        return model.package_model(
            fingerprint=new_fingerprint,
            output_directory=output_path,
            train_path=train_path,
            fixed_model_name=fixed_model_name,
        )

    print_success("Nothing changed. You can use the old model stored at '{}'."
                  "".format(os.path.abspath(old_model)))
    return old_model
Exemple #29
0
def test_loader_loads_graph_runner(
    default_model_storage: ModelStorage,
    temp_cache: TrainingCache,
    tmp_path: Path,
    tmp_path_factory: TempPathFactory,
    domain_path: Path,
):
    graph_trainer = GraphTrainer(
        model_storage=default_model_storage,
        cache=temp_cache,
        graph_runner_class=DaskGraphRunner,
    )

    test_value = "test_value"

    train_schema = GraphSchema(
        {
            "train": SchemaNode(
                needs={},
                uses=PersistableTestComponent,
                fn="train",
                constructor_name="create",
                config={"test_value": test_value},
                is_target=True,
            ),
            "load": SchemaNode(
                needs={"resource": "train"},
                uses=PersistableTestComponent,
                fn="run_inference",
                constructor_name="load",
                config={},
            ),
        }
    )
    predict_schema = GraphSchema(
        {
            "load": SchemaNode(
                needs={},
                uses=PersistableTestComponent,
                fn="run_inference",
                constructor_name="load",
                config={},
                is_target=True,
                resource=Resource("train"),
            )
        }
    )

    output_filename = tmp_path / "model.tar.gz"

    importer = TrainingDataImporter.load_from_dict(
        training_data_paths=[], domain_path=str(domain_path)
    )

    trained_at = datetime.utcnow()
    with freezegun.freeze_time(trained_at):
        model_metadata = graph_trainer.train(
            GraphModelConfiguration(
                train_schema=train_schema,
                predict_schema=predict_schema,
                training_type=TrainingType.BOTH,
                language=None,
                core_target=None,
                nlu_target=None,
            ),
            importer=importer,
            output_filename=output_filename,
        )

    assert isinstance(model_metadata, ModelMetadata)
    assert output_filename.is_file()

    loaded_model_storage_path = tmp_path_factory.mktemp("loaded model storage")

    model_metadata, loaded_predict_graph_runner = loader.load_predict_graph_runner(
        storage_path=loaded_model_storage_path,
        model_archive_path=output_filename,
        model_storage_class=LocalModelStorage,
        graph_runner_class=DaskGraphRunner,
    )

    assert loaded_predict_graph_runner.run() == {"load": test_value}

    assert model_metadata.predict_schema == predict_schema
    assert model_metadata.train_schema == train_schema
    assert model_metadata.model_id
    assert model_metadata.domain.as_dict() == Domain.from_path(domain_path).as_dict()
    assert model_metadata.rasa_open_source_version == rasa.__version__
    assert model_metadata.trained_at == trained_at
Exemple #30
0
def test_graph_trainer_returns_model_metadata(
    default_model_storage: ModelStorage,
    temp_cache: TrainingCache,
    tmp_path: Path,
    domain_path: Path,
):
    graph_trainer = GraphTrainer(
        model_storage=default_model_storage,
        cache=temp_cache,
        graph_runner_class=DaskGraphRunner,
    )

    test_value = "test_value"

    train_schema = GraphSchema(
        {
            "train": SchemaNode(
                needs={},
                uses=PersistableTestComponent,
                fn="train",
                constructor_name="create",
                config={"test_value": test_value},
                is_target=True,
            ),
            "load": SchemaNode(
                needs={"resource": "train"},
                uses=PersistableTestComponent,
                fn="run_inference",
                constructor_name="load",
                config={},
            ),
        }
    )
    predict_schema = GraphSchema(
        {
            "load": SchemaNode(
                needs={},
                uses=PersistableTestComponent,
                fn="run_inference",
                constructor_name="load",
                config={},
                is_target=True,
                resource=Resource("train"),
            )
        }
    )

    output_filename = tmp_path / "model.tar.gz"
    model_metadata = graph_trainer.train(
        GraphModelConfiguration(
            train_schema=train_schema,
            predict_schema=predict_schema,
            language=None,
            core_target=None,
            nlu_target="nlu",
            training_type=TrainingType.BOTH,
        ),
        importer=TrainingDataImporter.load_from_dict(domain_path=str(domain_path)),
        output_filename=output_filename,
    )
    assert model_metadata.model_id
    assert model_metadata.domain.as_dict() == Domain.from_path(domain_path).as_dict()
    assert model_metadata.train_schema == train_schema
    assert model_metadata.predict_schema == predict_schema