async def test_core_only(project: Text):
    """A core-only importer exposes stories, domain and config but empty NLU data."""
    core_importer = TrainingDataImporter.load_core_importer_from_config(
        os.path.join(project, DEFAULT_CONFIG_PATH),
        os.path.join(project, DEFAULT_DOMAIN_PATH),
        training_data_paths=[os.path.join(project, DEFAULT_DATA_PATH)],
    )
    assert isinstance(core_importer, CoreDataImporter)

    # The Core-related artefacts have to be present ...
    assert not (await core_importer.get_stories()).is_empty()
    assert not (await core_importer.get_domain()).is_empty()
    assert await core_importer.get_config()

    # ... while the NLU training data must come back empty.
    assert (await core_importer.get_nlu_data()).is_empty()
def test_example_bot_training_on_initial_project(tmp_path: Path):
    """Loading the scaffolded project raises exactly two identical user warnings."""
    # we need to test this one separately, as we can't test it in place
    # configuration suggestions would otherwise change the initial file
    scaffold.create_initial_project(str(tmp_path))

    config_file = str(tmp_path / "config.yml")
    domain_file = str(tmp_path / "domain.yml")
    data_dir = str(tmp_path / "data")
    importer = TrainingDataImporter.load_from_config(
        config_file, domain_file, data_dir
    )

    with pytest.warns(UserWarning) as record:
        importer.get_nlu_data()
        importer.get_stories()

    # two for slot auto-fill removal
    assert len(record) == 2

    first_text = record[0].message.args[0]
    second_text = record[1].message.args[0]
    expected_fragment = (
        "Slot auto-fill has been removed in 3.0 and replaced with "
        "a new explicit mechanism to set slots."
    )
    assert expected_fragment in first_text
    assert first_text == second_text
async def _train_nlu_async(
    config: Text,
    nlu_data: Text,
    output: Text,
    train_path: Optional[Text] = None,
    fixed_model_name: Optional[Text] = None,
    persist_nlu_training_data: bool = False,
    additional_arguments: Optional[Dict] = None,
    domain: Optional[Union[Domain, Text]] = None,
    model_to_finetune: Optional[Text] = None,
    finetuning_epoch_fraction: float = 1.0,
) -> Optional[Text]:
    """Load the NLU data and hand it over to the validated NLU training.

    Args:
        config: Passed to the importer as the configuration.
        nlu_data: Passed to the importer as the only training data path.
        output: Forwarded to `_train_nlu_with_validated_data`.
        train_path: Forwarded to `_train_nlu_with_validated_data`.
        fixed_model_name: Forwarded to `_train_nlu_with_validated_data`.
        persist_nlu_training_data: Forwarded to `_train_nlu_with_validated_data`.
        additional_arguments: Forwarded to `_train_nlu_with_validated_data`.
        domain: Passed to the importer as the domain.
        model_to_finetune: Forwarded to `_train_nlu_with_validated_data`.
        finetuning_epoch_fraction: Forwarded to `_train_nlu_with_validated_data`.

    Returns:
        Whatever `_train_nlu_with_validated_data` returns, or `None` when an
        error was printed and training was skipped.
    """
    if not nlu_data:
        rasa.shared.utils.cli.print_error(
            "No NLU data given. Please provide NLU data in order to train "
            "a Rasa NLU model using the '--nlu' argument."
        )
        return

    # training NLU only hence the training files still have to be selected
    nlu_importer = TrainingDataImporter.load_nlu_importer_from_config(
        config, domain, training_data_paths=[nlu_data]
    )
    loaded_data = await nlu_importer.get_nlu_data()

    # NOTE(review): training is skipped when `can_train_nlu_model()` is truthy,
    # which reads inverted relative to the method name — confirm what the
    # method actually returns before touching this condition.
    if loaded_data.can_train_nlu_model():
        rasa.shared.utils.cli.print_error(
            f"Path '{nlu_data}' doesn't contain valid NLU data in it. "
            f"Please verify the data format. "
            f"The NLU model training will be skipped now."
        )
        return

    forwarded_options = dict(
        output=output,
        train_path=train_path,
        fixed_model_name=fixed_model_name,
        persist_nlu_training_data=persist_nlu_training_data,
        additional_arguments=additional_arguments,
        model_to_finetune=model_to_finetune,
        finetuning_epoch_fraction=finetuning_epoch_fraction,
    )
    return await _train_nlu_with_validated_data(nlu_importer, **forwarded_options)
async def test_eval_data(component_builder, tmpdir, project):
    """Train a small pipeline and check the size of each evaluation result list."""
    pipeline = [
        {"name": "WhitespaceTokenizer"},
        {"name": "CountVectorsFeaturizer"},
        {"name": "DIETClassifier", "epochs": 2},
        {"name": "ResponseSelector", "epochs": 2},
    ]
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})

    nlu_files = [
        "data/examples/rasa/demo-rasa.md",
        "data/examples/rasa/demo-rasa-responses.md",
    ]
    data_importer = TrainingDataImporter.load_nlu_importer_from_config(
        os.path.join(project, "config.yml"), training_data_paths=nlu_files
    )

    # Only the third element of the training result (the persisted path) is used.
    _first, _second, persisted_path = await train(
        _config,
        path=tmpdir.strpath,
        data=data_importer,
        component_builder=component_builder,
        persist_nlu_training_data=True,
    )

    interpreter = Interpreter.load(persisted_path, component_builder)
    nlu_data = await data_importer.get_nlu_data()
    intent_results, response_selection_results, entity_results = get_eval_data(
        interpreter, nlu_data
    )

    for results in (intent_results, response_selection_results, entity_results):
        assert len(results) == 46
def test_number_of_examples_per_intent_with_yaml(tmp_path: Path):
    """Check intents and per-intent example counts of the YAML test data set."""
    # The importer gets an empty domain file and an empty config file.
    domain_file = tmp_path / "domain.yml"
    domain_file.write_text(Domain.empty().as_yaml())
    config_file = tmp_path / "config.yml"
    config_file.touch()

    data_files = [
        "data/test_number_nlu_examples/nlu.yml",
        "data/test_number_nlu_examples/stories.yml",
        "data/test_number_nlu_examples/rules.yml",
    ]
    importer = TrainingDataImporter.load_from_dict(
        {}, str(config_file), str(domain_file), data_files
    )

    nlu_data = importer.get_nlu_data()
    assert nlu_data.intents == {"greet", "ask_weather"}

    example_counts = nlu_data.number_of_examples_per_intent
    assert example_counts["greet"] == 2
    assert example_counts["ask_weather"] == 3
async def _create_data_generator(
    resource_name: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    use_conversation_test_files: bool = False,
) -> "TrainingDataGenerator":
    """Build a `TrainingDataGenerator` for the stories found at `resource_name`.

    Args:
        resource_name: Used as the single training data path of the importer.
        agent: Its `domain` is handed to the generator.
        max_stories: Passed to the generator as `tracker_limit`.
        use_conversation_test_files: If `True`, the importer's conversation
            tests are used instead of its stories.

    Returns:
        A generator with story concatenation disabled and no augmentation.
    """
    from rasa.shared.core.generator import TrainingDataGenerator

    importer = TrainingDataImporter.load_from_dict(
        training_data_paths=[resource_name]
    )
    load_graph = (
        importer.get_conversation_tests
        if use_conversation_test_files
        else importer.get_stories
    )
    story_graph = await load_graph()

    return TrainingDataGenerator(
        story_graph,
        agent.domain,
        use_story_concatenation=False,
        augmentation_factor=0,
        tracker_limit=max_stories,
    )
def test_no_warnings_with_default_project(tmp_path: Path):
    """Validating the auto-configured initial project must not emit any warning."""
    # Local import: `pytest.warns(None)` is deprecated, so warnings are recorded
    # with the standard library instead.
    import warnings

    rasa.utils.common.copy_directory(Path("rasa/cli/initial_project"), tmp_path)

    importer = TrainingDataImporter.load_from_config(
        config_path=str(tmp_path / "config.yml"),
        domain_path=str(tmp_path / "domain.yml"),
        training_data_paths=[str(tmp_path / "data")],
    )

    config, _missing_keys, _configured_keys = DefaultV1Recipe.auto_configure(
        importer.get_config_file_for_auto_config(),
        importer.get_config(),
        TrainingType.END_TO_END,
    )
    graph_config = DefaultV1Recipe().graph_config_for_recipe(
        config, cli_parameters={}, training_type=TrainingType.END_TO_END
    )
    validator = DefaultV1RecipeValidator(graph_config.train_schema)

    with warnings.catch_warnings(record=True) as records:
        # Record every warning, including repeated ones, as `pytest.warns` did.
        warnings.simplefilter("always")
        validator.validate(importer)

    assert len(records) == 0, [str(record.message) for record in records]
def test_should_not_retrain_core(
    domain_path: Text, tmp_path: Path, stack_config_path: Text
):
    """An unchanged fingerprint must not request Core retraining."""
    # Don't use `stories_path` as checkpoints currently break fingerprinting
    story_file = tmp_path / "simple_story.yml"
    story_file.write_text(
        """
stories:
- story: test_story
  steps:
  - intent: greet
  - action: utter_greet
    """
    )
    story_path = str(story_file)

    trained_model = train_core(
        domain_path, stack_config_path, story_path, str(tmp_path)
    )

    # Fingerprint the very same data the model above was trained on.
    importer = TrainingDataImporter.load_from_config(
        stack_config_path, domain_path, training_data_paths=[story_path]
    )
    new_fingerprint = model.model_fingerprint(importer)

    comparison = model.should_retrain(new_fingerprint, trained_model, tmp_path)
    assert not comparison.should_retrain_core()
async def test_eval_data(
    tmp_path: Path, project: Text, trained_rasa_model: Text,
):
    """Evaluate the demo NLU data with a trained model and check result sizes."""
    nlu_files = [
        "data/examples/rasa/demo-rasa.yml",
        "data/examples/rasa/demo-rasa-responses.yml",
    ]
    data_importer = TrainingDataImporter.load_nlu_importer_from_config(
        os.path.join(project, "config.yml"), training_data_paths=nlu_files
    )

    processor = Agent.load(trained_rasa_model).processor
    nlu_data = data_importer.get_nlu_data()

    evaluation = await get_eval_data(processor, nlu_data)
    intent_results, response_selection_results, entity_results = evaluation

    for results in (intent_results, response_selection_results, entity_results):
        assert len(results) == 46
def inner(
    train_schema: GraphSchema,
    cache: Optional[TrainingCache] = None,
    model_storage: Optional[ModelStorage] = None,
    path: Optional[Path] = None,
    force_retraining: bool = False,
) -> Path:
    """Train `train_schema` with a `GraphTrainer` and return the model archive path.

    Args:
        train_schema: Schema used as the train schema of the model configuration.
        cache: Training cache; created via `local_cache_creator` if falsy.
        model_storage: Model storage; a `LocalModelStorage` is created if falsy.
        path: Working directory; a temporary directory is created if falsy.
        force_retraining: Forwarded to `GraphTrainer.train`.

    Returns:
        Path of `model.tar.gz` inside `path`, asserted to exist after training.
    """
    # Fall back to freshly created defaults for everything the caller left out.
    path = path or tmp_path_factory.mktemp("model_storage_path")
    model_storage = model_storage or LocalModelStorage.create(path)
    cache = cache or local_cache_creator(path)

    trainer = GraphTrainer(
        model_storage=model_storage, cache=cache, graph_runner_class=DaskGraphRunner,
    )
    model_configuration = GraphModelConfiguration(
        train_schema=train_schema,
        predict_schema=GraphSchema({}),
        language=None,
        core_target=None,
        nlu_target="nlu",
        training_type=TrainingType.BOTH,
    )
    importer = TrainingDataImporter.load_from_dict(domain_path=str(domain_path))

    output_filename = path / "model.tar.gz"
    trainer.train(
        model_configuration,
        importer=importer,
        output_filename=output_filename,
        force_retraining=force_retraining,
    )

    assert output_filename.is_file()
    return output_filename
def test_core_warn_if_rule_data_unused(
    policy_type_not_consuming_rule_data: Type[Policy],
):
    """Rule data combined with a policy not consuming it must trigger a warning."""
    data_files = [
        "data/test_moodbot/data/nlu.yml",
        "data/test_moodbot/data/rules.yml",
    ]
    importer = TrainingDataImporter.load_from_dict(
        domain_path="data/test_moodbot/domain.yml", training_data_paths=data_files,
    )

    policy_node = SchemaNode({}, policy_type_not_consuming_rule_data, "", "", {})
    validator = DefaultV1RecipeValidator(GraphSchema({"policy": policy_node}))

    expected_warning = (
        "Found rule-based training data but no policy "
        "supporting rule-based data."
    )
    with pytest.warns(UserWarning, match=expected_warning):
        validator.validate(importer)
def test_core_warn_if_rule_data_missing(
    policy_type_consuming_rule_data: Type[Policy],
):
    """A rule-consuming policy without any rule data must trigger a warning."""
    data_files = [
        "data/test_e2ebot/data/nlu.yml",
        "data/test_e2ebot/data/stories.yml",
    ]
    importer = TrainingDataImporter.load_from_dict(
        domain_path="data/test_e2ebot/domain.yml", training_data_paths=data_files,
    )

    policy_node = SchemaNode({}, policy_type_consuming_rule_data, "", "", {})
    validator = DefaultV1RecipeValidator(GraphSchema({"policy": policy_node}))

    expected_warning = (
        "Found a rule-based policy in your configuration "
        "but no rule-based training data."
    )
    with pytest.warns(UserWarning, match=expected_warning):
        validator.validate(importer)
def test_nlu_only(project: Text):
    """An NLU-only importer returns NLU data and config, but no Core artefacts."""
    config_path = os.path.join(project, DEFAULT_CONFIG_PATH)
    default_data_path = os.path.join(project, DEFAULT_DATA_PATH)

    actual = TrainingDataImporter.load_nlu_importer_from_config(
        config_path, training_data_paths=[default_data_path]
    )

    assert isinstance(actual, NluDataImporter)
    assert isinstance(actual._importer, ResponsesSyncImporter)

    stories = actual.get_stories()
    assert stories.is_empty()

    # Query the conversation tests themselves; asking for the stories a second
    # time would only repeat the assertion above.
    conversation_tests = actual.get_conversation_tests()
    assert conversation_tests.is_empty()

    domain = actual.get_domain()
    assert domain.is_empty()

    config = actual.get_config()
    assert config

    nlu_data = actual.get_nlu_data()
    assert not nlu_data.is_empty()
async def test_nlu_data_domain_sync_with_retrieval_intents(project: Text):
    """Retrieval intents and their responses show up in the imported domain."""
    nlu_and_response_files = [
        "data/test_nlu/default_retrieval_intents.md",
        "data/test_responses/default.md",
    ]
    base_data_importer = TrainingDataImporter.load_from_dict(
        {},
        os.path.join(project, DEFAULT_CONFIG_PATH),
        "data/test_domains/default_retrieval_intents.yml",
        nlu_and_response_files,
    )

    # Combine an NLU-only and a Core-only view of the same base importer and
    # wrap the result in the retrieval-model importer under test.
    combined_importer = CombinedDataImporter(
        [NluDataImporter(base_data_importer), CoreDataImporter(base_data_importer)]
    )
    importer = RetrievalModelsDataImporter(combined_importer)

    domain = await importer.get_domain()
    nlu_data = await importer.get_nlu_data()

    assert domain.retrieval_intents == ["chitchat"]
    assert domain.intent_properties["chitchat"].get("is_retrieval_intent")
    assert domain.templates == nlu_data.responses
    assert "utter_chitchat" in domain.action_names
def interactive(args: argparse.Namespace) -> None:
    """Resolve or train the initial model and start interactive learning.

    If `args.model` is `None`, a model is trained from the given data (Core only
    when `args.core_only` is set); otherwise the provided model is looked up.
    Every failure path goes through `print_error_and_exit`.

    Args:
        args: Parsed command line arguments.
    """
    _set_not_required_args(args)

    training_files = [args.stories] if args.core_only else args.data
    file_importer = TrainingDataImporter.load_from_config(
        args.config, args.domain, training_files
    )

    if args.model is not None:
        zipped_model = get_provided_model(args.model)
        if not zipped_model or not os.path.exists(zipped_model):
            rasa.shared.utils.cli.print_error_and_exit(
                f"Interactive learning process cannot be started as no "
                f"initial model was found at path '{args.model}'. "
                f"Use 'rasa train' to train a model."
            )
    else:
        # No model was passed, so one has to be trained — which needs stories.
        story_graph = file_importer.get_stories()
        if not story_graph or story_graph.is_empty():
            rasa.shared.utils.cli.print_error_and_exit(
                "Could not run interactive learning without either core "
                "data or a model containing core data."
            )

        if args.core_only:
            zipped_model = train.run_core_training(args)
        else:
            zipped_model = train.run_training(args)

        if not zipped_model:
            rasa.shared.utils.cli.print_error_and_exit(
                "Could not train an initial model. Either pass paths "
                "to the relevant training files (`--data`, `--config`, `--domain`), "
                "or use 'rasa train' to train a model."
            )

    if not args.skip_visualization:
        logger.info(f"Loading visualization data from {args.data}.")

    perform_interactive_learning(args, zipped_model, file_importer)
def _create_data_generator(
    resource_name: Text,
    agent: "Agent",
    max_stories: Optional[int] = None,
    use_conversation_test_files: bool = False,
) -> "TrainingDataGenerator":
    """Build a `TrainingDataGenerator` for the stories found at `resource_name`.

    The agent's domain is persisted to a temporary file so the importer can be
    given a domain path. The temporary directory is removed again once the
    story graph has been requested from the importer.

    Args:
        resource_name: Used as the single training data path of the importer.
        agent: Its `domain` is persisted for the importer and handed to the
            generator.
        max_stories: Passed to the generator as `tracker_limit`.
        use_conversation_test_files: If `True`, the importer's conversation
            tests are used instead of its stories.

    Returns:
        A generator with story concatenation disabled and no augmentation.
    """
    from rasa.shared.core.generator import TrainingDataGenerator

    # `TemporaryDirectory` cleans up after itself; a bare `mkdtemp()` would
    # leave one directory behind per call.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_domain_path = Path(tmp_dir) / "domain.yaml"
        agent.domain.persist(tmp_domain_path)

        test_data_importer = TrainingDataImporter.load_from_dict(
            training_data_paths=[resource_name], domain_path=str(tmp_domain_path)
        )
        # Request the graph while the persisted domain file still exists.
        if use_conversation_test_files:
            story_graph = test_data_importer.get_conversation_tests()
        else:
            story_graph = test_data_importer.get_stories()

    return TrainingDataGenerator(
        story_graph,
        agent.domain,
        use_story_concatenation=False,
        augmentation_factor=0,
        tracker_limit=max_stories,
    )
async def test_import_nlu_training_data_with_default_actions(project: Text):
    """The E2E importer adds one NLU example per default action name."""
    e2e_importer = TrainingDataImporter.load_from_dict(
        {},
        os.path.join(project, DEFAULT_CONFIG_PATH),
        os.path.join(project, DEFAULT_DOMAIN_PATH),
        [os.path.join(project, DEFAULT_DATA_PATH)],
    )
    assert isinstance(e2e_importer, E2EImporter)
    wrapped_importer = e2e_importer.importer

    # Check additional NLU training data from domain was added
    nlu_data = await e2e_importer.get_nlu_data()
    plain_nlu_data = await wrapped_importer.get_nlu_data()
    assert len(nlu_data.training_examples) > len(plain_nlu_data.training_examples)

    extended_training_data = await e2e_importer.get_nlu_data()
    for action_name in rasa.shared.core.constants.DEFAULT_ACTION_NAMES:
        action_message = Message(data={ACTION_NAME: action_name, ACTION_TEXT: ""})
        assert action_message in extended_training_data.training_examples
def test_no_warnings_with_default_project(tmp_path: Path):
    """Validating the initial project only yields slot auto-fill warnings."""
    rasa.utils.common.copy_directory(Path("rasa/cli/initial_project"), tmp_path)

    importer = TrainingDataImporter.load_from_config(
        config_path=str(tmp_path / "config.yml"),
        domain_path=str(tmp_path / "domain.yml"),
        training_data_paths=[str(tmp_path / "data")],
    )

    recipe = DefaultV1Recipe()
    graph_config = recipe.graph_config_for_recipe(
        importer.get_config(),
        cli_parameters={},
        training_type=TrainingType.END_TO_END,
    )
    validator = DefaultV1RecipeValidator(graph_config.train_schema)

    with pytest.warns(
        UserWarning, match="Slot auto-fill has been removed in 3.0"
    ) as records:
        validator.validate(importer)

    # Every recorded warning has to be the slot auto-fill one.
    assert all(
        warn.message.args[0].startswith("Slot auto-fill has been removed")
        for warn in records.list
    )
def _validate(
    self, importer: TrainingDataImporter, nlu: bool = True, core: bool = True,
) -> None:
    """Validate whether the finetuning setting conflicts with other settings.

    Note that this validation always takes into account the configuration of
    nlu *and* core part, while the validation of aspects of the domain and
    the NLU training data only happen if we request to validate finetuning
    with respect to NLU/Core models, respectively.

    For more details, see docstring of this class.

    Args:
        importer: a training data importer
        nlu: set to `False` if NLU part should not be validated
        core: set to `False` if Core part should not be validated

    Raises:
        `InvalidConfigException` if there is a conflict
    """
    if self._is_finetuning and not self._fingerprints:
        raise InvalidConfigException(
            f"Finetuning is enabled but the {self.__class__.__name__} "
            f"does not remember seeing a training run. Ensure that you have "
            f"trained your model at least once (with finetuning disabled) "
            f"and ensure that the {self.__class__.__name__} is part of the "
            f"training graph. "
        )

    rasa_version = rasa.__version__
    if self._is_finetuning:
        # Refuse to finetune models produced by a too old Rasa version.
        old_rasa_version = self._fingerprints[FINGERPRINT_VERSION]
        if version.parse(old_rasa_version) < version.parse(
            MINIMUM_COMPATIBLE_VERSION
        ):
            raise InvalidConfigException(
                f"The minimum compatible Rasa Version is "
                f"{MINIMUM_COMPATIBLE_VERSION} but the model we attempt to "
                f"finetune has been generated with an older version "
                f"({old_rasa_version})."
            )
    # The stored version is always replaced by the currently running one.
    self._fingerprints[FINGERPRINT_VERSION] = rasa_version

    config = importer.get_config()
    self._compare_or_memorize(
        fingerprint_key=FINGERPRINT_CONFIG_WITHOUT_EPOCHS_KEY,
        new_fingerprint=self._get_fingerprint_of_config_without_epochs(config),
        error_message=(
            "Cannot finetune because more than just the 'epoch' keys have been "
            "changed in the configuration. "
            "Please revert your configuration and only change "
            "the 'epoch' settings where needed."
        ),
    )

    if core:
        # NOTE: If there's a consistency check between domain and core training data
        # that ensures domain and core training data are consistent, then we can
        # drop this check.
        domain = importer.get_domain()
        self._compare_or_memorize(
            fingerprint_key=FINGERPRINT_CORE,
            new_fingerprint=self._get_fingerprint_of_domain_without_responses(
                domain
            ),
            error_message=(
                "Cannot finetune because more than just the responses have been "
                "changed in the domain. "
                "Please revert all settings in your domain file (except the "
                "'responses')."
            ),
        )

    if nlu:
        nlu_data = importer.get_nlu_data()
        self._compare_or_memorize(
            fingerprint_key=FINGERPRINT_NLU,
            new_fingerprint=nlu_data.label_fingerprint(),
            error_message=(
                "Cannot finetune because NLU training data contains new labels "
                "or does not contain any examples for some known labels. "
                "Please make sure that the NLU data that you use "
                "for finetuning contains at least one example for every label "
                "(i.e. intent, action name, ...) that was included in the NLU "
                "data used for training the model which we attempt to finetune "
                "now. Moreover, you must not add labels that were not included "
                "during training before. "
            ),
        )

    self.persist()
async def train_async(
    domain: Union[Domain, Text],
    config: Text,
    training_files: Optional[Union[Text, List[Text]]],
    output: Text = DEFAULT_MODELS_PATH,
    dry_run: bool = False,
    force_training: bool = False,
    fixed_model_name: Optional[Text] = None,
    persist_nlu_training_data: bool = False,
    core_additional_arguments: Optional[Dict] = None,
    nlu_additional_arguments: Optional[Dict] = None,
    model_to_finetune: Optional[Text] = None,
    finetuning_epoch_fraction: float = 1.0,
) -> TrainingResult:
    """Trains a Rasa model (Core and NLU).

    Args:
        domain: Path to the domain file.
        config: Path to the config for Core and NLU.
        training_files: Paths to the training data for Core and NLU.
        output: Output path.
        dry_run: If `True` then no training will be done, and the information about
            whether the training needs to be done will be printed.
        force_training: If `True` retrain model even if data has not changed.
        fixed_model_name: Name of model to be stored.
        persist_nlu_training_data: `True` if the NLU training data should be
            persisted with the model.
        core_additional_arguments: Additional training parameters for core training.
        nlu_additional_arguments: Additional training parameters forwarded to
            training method of each NLU component.
        model_to_finetune: Optional path to a model which should be finetuned or
            a directory in case the latest trained model should be used.
        finetuning_epoch_fraction: The fraction currently specified training epochs
            in the model configuration which should be used for finetuning.

    Returns:
        An instance of `TrainingResult`.
    """
    file_importer = TrainingDataImporter.load_from_config(
        config, domain, training_files
    )

    # The temporary training directory only lives for the duration of this call.
    with TempDirectoryPath(tempfile.mkdtemp()) as train_path:
        loaded_domain = await file_importer.get_domain()

        if not loaded_domain.is_empty():
            return await _train_async_internal(
                file_importer,
                train_path,
                output,
                dry_run,
                force_training,
                fixed_model_name,
                persist_nlu_training_data,
                core_additional_arguments=core_additional_arguments,
                nlu_additional_arguments=nlu_additional_arguments,
                model_to_finetune=model_to_finetune,
                finetuning_epoch_fraction=finetuning_epoch_fraction,
            )

        # Empty domain: delegate to the dedicated handler and wrap its result.
        nlu_model = await handle_domain_if_not_exists(
            file_importer, output, fixed_model_name
        )
        return TrainingResult(model=nlu_model)
async def _train_core_with_validated_data(
    file_importer: TrainingDataImporter,
    output: Text,
    train_path: Optional[Text] = None,
    fixed_model_name: Optional[Text] = None,
    additional_arguments: Optional[Dict] = None,
    interpreter: Optional[Interpreter] = None,
    model_to_finetune: Optional["Text"] = None,
    finetuning_epoch_fraction: float = 1.0,
) -> Optional[Text]:
    """Train Core with validated training and config data.

    Args:
        file_importer: Supplies domain, config and training data.
        output: Directory the packaged model is written to.
        train_path: Directory to train in; a temporary one is used if falsy.
        fixed_model_name: Forwarded to `model.package_model`.
        additional_arguments: Forwarded to `rasa.core.train.train`.
        interpreter: Forwarded to `rasa.core.train.train`.
        model_to_finetune: If truthy, resolved via `_core_model_for_finetuning`.
        finetuning_epoch_fraction: Forwarded to `_core_model_for_finetuning`.

    Returns:
        The result of `model.package_model` if `train_path` is `None`,
        otherwise the training directory.
    """
    import rasa.core.train

    with ExitStack() as exit_stack:
        # A provided train path is left alone on exit; otherwise a temporary
        # one is created and cleaned up when the stack unwinds.
        _train_path = train_path or exit_stack.enter_context(
            TempDirectoryPath(tempfile.mkdtemp())
        )

        # normal (not compare) training
        rasa.shared.utils.cli.print_color(
            "Training Core model...", color=rasa.shared.utils.io.bcolors.OKBLUE
        )

        core_domain, policy_config = await asyncio.gather(
            file_importer.get_domain(), file_importer.get_config()
        )

        if model_to_finetune:
            rasa.shared.utils.common.mark_as_experimental_feature(
                "Incremental Training feature"
            )
            model_to_finetune = await _core_model_for_finetuning(
                model_to_finetune,
                file_importer=file_importer,
                finetuning_epoch_fraction=finetuning_epoch_fraction,
            )

            if not model_to_finetune:
                rasa.shared.utils.cli.print_error_and_exit(
                    f"No Core model for finetuning found. Please make sure to either "
                    f"specify a path to a previous model or to have a finetunable "
                    f"model within the directory '{output}'."
                )

        core_output_path = os.path.join(_train_path, DEFAULT_CORE_SUBDIRECTORY_NAME)
        async with telemetry.track_model_training(
            file_importer,
            model_type="core",
            is_finetuning=model_to_finetune is not None,
        ):
            await rasa.core.train.train(
                domain_file=core_domain,
                training_resource=file_importer,
                output_path=core_output_path,
                policy_config=policy_config,
                additional_arguments=additional_arguments,
                interpreter=interpreter,
                model_to_finetune=model_to_finetune,
            )
        rasa.shared.utils.cli.print_color(
            "Core model training completed.",
            color=rasa.shared.utils.io.bcolors.OKBLUE,
        )

        if train_path is not None:
            return _train_path

        # Only Core was trained.
        new_fingerprint = await model.model_fingerprint(file_importer)
        return model.package_model(
            fingerprint=new_fingerprint,
            output_directory=output,
            train_path=_train_path,
            fixed_model_name=fixed_model_name,
            model_prefix="core-",
        )
async def train_core_async(
    domain: Union[Domain, Text],
    config: Text,
    stories: Text,
    output: Text,
    train_path: Optional[Text] = None,
    fixed_model_name: Optional[Text] = None,
    additional_arguments: Optional[Dict] = None,
    model_to_finetune: Optional[Text] = None,
    finetuning_epoch_fraction: float = 1.0,
) -> Optional[Text]:
    """Trains a Core model.

    Args:
        domain: Path to the domain file.
        config: Path to the config file for Core.
        stories: Path to the Core training data.
        output: Output path.
        train_path: If `None` the model will be trained in a temporary
            directory, otherwise in the provided directory.
        fixed_model_name: Name of model to be stored.
        additional_arguments: Additional training parameters.
        model_to_finetune: Optional path to a model which should be finetuned or
            a directory in case the latest trained model should be used.
        finetuning_epoch_fraction: The fraction currently specified training epochs
            in the model configuration which should be used for finetuning.

    Returns:
        If `train_path` is `None` it returns the path to the model archive,
        otherwise the path to the directory with the trained model files.
        `None` is returned if training was skipped because of invalid input.
    """
    file_importer = TrainingDataImporter.load_core_importer_from_config(
        config, domain, [stories]
    )
    # Use separate local names so the path-typed parameters are not rebound.
    story_graph, nlu_data, loaded_domain = await asyncio.gather(
        file_importer.get_stories(),
        file_importer.get_nlu_data(),
        file_importer.get_domain(),
    )

    if nlu_data.has_e2e_examples():
        rasa.shared.utils.cli.print_error(
            "Stories file contains e2e stories. Please train using `rasa train` so that"
            " the NLU model is also trained."
        )
        return None

    if loaded_domain.is_empty():
        rasa.shared.utils.cli.print_error(
            "Core training was skipped because no valid domain file was found. "
            "Please specify a valid domain using '--domain' argument or check "
            "if the provided domain file exists."
        )
        return None

    # Also treat a present-but-empty story graph as "no stories".
    if not story_graph or story_graph.is_empty():
        rasa.shared.utils.cli.print_error(
            "No stories given. Please provide stories in order to "
            "train a Rasa Core model using the '--stories' argument."
        )
        return None

    return await _train_core_with_validated_data(
        file_importer,
        output=output,
        train_path=train_path,
        fixed_model_name=fixed_model_name,
        additional_arguments=additional_arguments,
        model_to_finetune=model_to_finetune,
        finetuning_epoch_fraction=finetuning_epoch_fraction,
    )
async def _train_async_internal(
    file_importer: TrainingDataImporter,
    train_path: Text,
    output_path: Text,
    dry_run: bool,
    force_training: bool,
    fixed_model_name: Optional[Text],
    persist_nlu_training_data: bool,
    core_additional_arguments: Optional[Dict] = None,
    nlu_additional_arguments: Optional[Dict] = None,
    model_to_finetune: Optional[Text] = None,
    finetuning_epoch_fraction: float = 1.0,
) -> TrainingResult:
    """Trains a Rasa model (Core and NLU). Use only from `train_async`.

    Depending on the available data this trains only NLU (no stories), only
    Core (no NLU data and no e2e examples) or both. In the combined case the
    model fingerprint decides whether training is required at all.

    Args:
        file_importer: `TrainingDataImporter` which supplies the training data.
        train_path: Directory in which to train the model.
        output_path: Output path.
        dry_run: If `True` then no training will be done, and the information about
            whether the training needs to be done will be printed.
        force_training: If `True` retrain model even if data has not changed.
        fixed_model_name: Name of model to be stored.
        persist_nlu_training_data: `True` if the NLU training data should be
            persisted with the model.
        core_additional_arguments: Additional training parameters for core training.
        nlu_additional_arguments: Additional training parameters forwarded to
            training method of each NLU component.
        model_to_finetune: Optional path to a model which should be finetuned or
            a directory in case the latest trained model should be used.
        finetuning_epoch_fraction: The fraction currently specified training epochs
            in the model configuration which should be used for finetuning.

    Returns:
        An instance of `TrainingResult`.
    """
    stories, nlu_data = await asyncio.gather(
        file_importer.get_stories(), file_importer.get_nlu_data()
    )

    # First fingerprint comparison: its result is only consumed by the dry run.
    new_fingerprint = await model.model_fingerprint(file_importer)
    old_model = model.get_latest_model(output_path)

    fingerprint_comparison = model.should_retrain(
        new_fingerprint, old_model, train_path, force_training=force_training
    )

    if dry_run:
        # Report what would be trained and stop without training anything.
        code, texts = dry_run_result(fingerprint_comparison)
        for text in texts:
            print_warning(text) if code > 0 else print_success(text)
        return TrainingResult(code=code)

    if nlu_data.has_e2e_examples():
        rasa.shared.utils.common.mark_as_experimental_feature("end-to-end training")

    if stories.is_empty() and nlu_data.contains_no_pure_nlu_data():
        rasa.shared.utils.cli.print_error(
            "No training data given. Please provide stories and NLU data in "
            "order to train a Rasa model using the '--data' argument."
        )
        return TrainingResult()

    if stories.is_empty():
        rasa.shared.utils.cli.print_warning(
            "No stories present. Just a Rasa NLU model will be trained."
        )
        trained_model = await _train_nlu_with_validated_data(
            file_importer,
            output=output_path,
            fixed_model_name=fixed_model_name,
            persist_nlu_training_data=persist_nlu_training_data,
            additional_arguments=nlu_additional_arguments,
            model_to_finetune=model_to_finetune,
            finetuning_epoch_fraction=finetuning_epoch_fraction,
        )
        return TrainingResult(model=trained_model)

    # We will train nlu if there are any nlu example, including from e2e stories.
    if nlu_data.contains_no_pure_nlu_data() and not nlu_data.has_e2e_examples():
        rasa.shared.utils.cli.print_warning(
            "No NLU data present. Just a Rasa Core model will be trained."
        )
        trained_model = await _train_core_with_validated_data(
            file_importer,
            output=output_path,
            fixed_model_name=fixed_model_name,
            additional_arguments=core_additional_arguments,
            model_to_finetune=model_to_finetune,
            finetuning_epoch_fraction=finetuning_epoch_fraction,
        )
        return TrainingResult(model=trained_model)

    # NOTE(review): fingerprint and latest model are computed a second time here,
    # and the comparison is redone with `has_e2e_examples` — presumably
    # intentional; confirm before removing either computation.
    new_fingerprint = await model.model_fingerprint(file_importer)
    old_model = model.get_latest_model(output_path)

    if not force_training:
        fingerprint_comparison = model.should_retrain(
            new_fingerprint,
            old_model,
            train_path,
            has_e2e_examples=nlu_data.has_e2e_examples(),
        )
    else:
        fingerprint_comparison = FingerprintComparisonResult(force_training=True)

    if fingerprint_comparison.is_training_required():
        async with telemetry.track_model_training(
            file_importer, model_type="rasa",
        ):
            await _do_training(
                file_importer,
                output_path=output_path,
                train_path=train_path,
                fingerprint_comparison_result=fingerprint_comparison,
                fixed_model_name=fixed_model_name,
                persist_nlu_training_data=persist_nlu_training_data,
                core_additional_arguments=core_additional_arguments,
                nlu_additional_arguments=nlu_additional_arguments,
                old_model_zip_path=old_model,
                model_to_finetune=model_to_finetune,
                finetuning_epoch_fraction=finetuning_epoch_fraction,
            )
        # Package whatever was trained into `train_path` under the new fingerprint.
        trained_model = model.package_model(
            fingerprint=new_fingerprint,
            output_directory=output_path,
            train_path=train_path,
            fixed_model_name=fixed_model_name,
        )
        return TrainingResult(model=trained_model)

    # Nothing needs retraining: point the caller to the existing model.
    rasa.shared.utils.cli.print_success(
        "Nothing changed. You can use the old model stored at '{}'."
        "".format(os.path.abspath(old_model))
    )
    return TrainingResult(model=old_model)
def test_nlu_training_data_provider(
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    config_path: Text,
    nlu_data_path: Text,
):
    """The provider returns the importer's NLU data and persists it only on request."""
    # create a resource and an importer
    resource = Resource("xy")
    importer = TrainingDataImporter.load_from_config(
        config_path=config_path, training_data_paths=[nlu_data_path]
    )

    # check the default configuration is as expected
    default_config = NLUTrainingDataProvider.get_default_config()
    assert default_config["language"] is None
    assert default_config["persist"] is False

    # create a provider with persist == True
    persisting_provider = NLUTrainingDataProvider.create(
        {"language": "en", "persist": True},
        default_model_storage,
        resource,
        default_execution_context,
    )
    assert isinstance(persisting_provider, NLUTrainingDataProvider)

    # check the data provided is as expected
    provided_data = persisting_provider.provide(importer)
    imported_data = importer.get_nlu_data(language="en")
    assert provided_data.fingerprint() == imported_data.fingerprint()

    # check the data was persisted
    with default_model_storage.read_from(resource) as resource_directory:
        data_file = os.path.join(
            str(resource_directory), DEFAULT_TRAINING_DATA_OUTPUT_PATH
        )
        persisted_data = load_data(resource_name=data_file, language="en")
        assert os.path.isfile(data_file)
        assert isinstance(persisted_data, TrainingData)

        # delete the persisted data
        os.remove(data_file)
        assert not os.path.isfile(data_file)

    # create a provider with persist == False
    non_persisting_provider = NLUTrainingDataProvider.create(
        {"language": "en", "persist": False},
        default_model_storage,
        resource,
        default_execution_context,
    )
    non_persisting_provider.provide(importer)

    # check the data was not persisted
    with default_model_storage.read_from(resource) as resource_directory:
        data_file = os.path.join(
            str(resource_directory), DEFAULT_TRAINING_DATA_OUTPUT_PATH
        )
        assert not os.path.isfile(data_file)
async def test_nlu_comparison(
    tmp_path: Path, monkeypatch: MonkeyPatch, nlu_as_json_path: Text
):
    """Compare two NLU configs with mocked training and check the output layout."""
    pipeline = [
        {"name": "WhitespaceTokenizer"},
        {"name": "KeywordIntentClassifier"},
        {"name": "RegexEntityExtractor"},
    ]
    config = {"language": "en", "pipeline": pipeline}
    # the configs need to be at a different path, otherwise the results are
    # combined on the same dictionary key and cannot be plotted properly
    configs = [write_file_config(config).name, write_file_config(config).name]

    # mock training
    nlu_test_module = sys.modules["rasa.nlu.test"]
    monkeypatch.setattr(Interpreter, "load", Mock(spec=RasaNLUInterpreter))
    monkeypatch.setattr(sys.modules["rasa.nlu"], "train", AsyncMock())
    monkeypatch.setattr(
        nlu_test_module, "remove_pretrained_extractors", Mock(return_value=None),
    )
    monkeypatch.setattr(
        nlu_test_module, "get_eval_data", Mock(return_value=(1, None, (None,))),
    )
    monkeypatch.setattr(
        nlu_test_module, "evaluate_intents", Mock(return_value={"f1_score": 1}),
    )

    output = str(tmp_path)
    test_data_importer = TrainingDataImporter.load_from_dict(
        training_data_paths=[nlu_as_json_path]
    )
    test_data = await test_data_importer.get_nlu_data()
    await compare_nlu_models(
        configs, test_data, output, runs=2, exclusion_percentages=[50, 80]
    )

    # Top level: one directory per run plus the aggregated results.
    expected_top_level = {
        "run_1",
        "run_2",
        "results.json",
        "nlu_model_comparison_graph.pdf",
    }
    assert set(os.listdir(output)) == expected_top_level

    # Each run holds one directory per exclusion percentage and the test split.
    run_1_path = os.path.join(output, "run_1")
    assert set(os.listdir(run_1_path)) == {
        "50%_exclusion",
        "80%_exclusion",
        "test.md",
    }

    # Each exclusion directory holds the training data plus per-config outputs.
    exclude_50_path = os.path.join(run_1_path, "50%_exclusion")
    model_names = [
        os.path.splitext(os.path.basename(config_file))[0] for config_file in configs
    ]
    expected_outputs = {"train"}
    expected_outputs.update(f"{name}_report" for name in model_names)
    expected_outputs.update(f"{name}.tar.gz" for name in model_names)
    assert set(os.listdir(exclude_50_path)) == expected_outputs
async def run_nlu_test_async(
    config: Optional[Union[Text, List[Text]]],
    data_path: Text,
    models_path: Text,
    output_dir: Text,
    cross_validation: bool,
    percentages: List[int],
    runs: int,
    no_errors: bool,
    all_args: Dict[Text, Any],
) -> None:
    """Runs NLU tests.

    Depending on the arguments this either compares several configurations,
    runs a cross validation or evaluates an already trained model.

    Args:
        config: the model configuration. It can be a single file, a list of
            multiple files or a folder with multiple config files inside.
        data_path: path for the nlu data.
        models_path: path to a trained Rasa model.
        output_dir: output path for any files created during the evaluation.
        cross_validation: indicates if the model should be tested using
            cross validation or not.
        percentages: defines the exclusion percentage of the training data.
        runs: number of comparison runs to make.
        no_errors: indicates if incorrect predictions should be written to a
            file or not.
        all_args: all arguments gathered in a Dict so they can be passed as one
            argument to other functions. The key `"errors"` is set in place.
    """
    from rasa.model_testing import (
        compare_nlu_models,
        perform_nlu_cross_validation,
        test_nlu,
    )

    data_path = rasa.cli.utils.get_validated_path(data_path, "nlu", DEFAULT_DATA_PATH)
    importer = TrainingDataImporter.load_from_dict(training_data_paths=[data_path])
    nlu_data = await importer.get_nlu_data()

    output = output_dir or DEFAULT_RESULTS_PATH
    all_args["errors"] = not no_errors
    rasa.shared.utils.io.create_directory(output)

    # A single entry is unwrapped; if it is a directory it is expanded to the
    # list of files it contains, which switches to comparison mode below.
    if config is not None and len(config) == 1:
        config = os.path.abspath(config[0])
        if os.path.isdir(config):
            config = rasa.shared.utils.io.list_files(config)

    if isinstance(config, list):
        logger.info(
            "Multiple configuration files specified, running nlu comparison mode."
        )

        # Only files that pass the config schema validation are compared.
        valid_configs = []
        for file in config:
            try:
                validation_utils.validate_yaml_schema(
                    rasa.shared.utils.io.read_file(file), CONFIG_SCHEMA_FILE
                )
            except YamlException:
                rasa.shared.utils.io.raise_warning(
                    f"Ignoring file '{file}' as it is not a valid config file."
                )
            else:
                valid_configs.append(file)

        await compare_nlu_models(
            configs=valid_configs,
            test_data=nlu_data,
            output=output,
            runs=runs,
            exclusion_percentages=percentages,
        )
        return

    if cross_validation:
        logger.info("Test model using cross validation.")
        config = rasa.cli.utils.get_validated_path(
            config, "config", DEFAULT_CONFIG_PATH
        )
        perform_nlu_cross_validation(config, nlu_data, output, all_args)
        return

    model_path = rasa.cli.utils.get_validated_path(
        models_path, "model", DEFAULT_MODELS_PATH
    )
    await test_nlu(model_path, data_path, output, all_args)
async def test_import_nlu_training_data_from_e2e_stories(project: Text):
    """Check that the `E2EImporter` adds NLU training data taken from stories."""
    config_path = os.path.join(project, DEFAULT_CONFIG_PATH)
    domain_path = os.path.join(project, DEFAULT_DOMAIN_PATH)
    default_data_path = os.path.join(project, DEFAULT_DATA_PATH)
    importer = TrainingDataImporter.load_from_dict(
        {}, config_path, domain_path, [default_data_path]
    )

    # The `E2EImporter` correctly wraps the underlying `CombinedDataImporter`
    assert isinstance(importer, E2EImporter)
    importer_without_e2e = importer.importer

    stories = StoryGraph(
        [
            StoryStep(
                events=[
                    SlotSet("some slot", "doesn't matter"),
                    UserUttered("greet_from_stories", {"name": "greet_from_stories"}),
                    ActionExecuted("utter_greet_from_stories"),
                ]
            ),
            StoryStep(
                events=[
                    UserUttered("how are you doing?"),
                    ActionExecuted("utter_greet_from_stories", action_text="Hi Joey."),
                ]
            ),
        ]
    )

    # Patch to return our test stories. A native coroutine function is used
    # because `asyncio.coroutine` was deprecated in Python 3.8 and removed in
    # Python 3.11. Arguments are accepted (and ignored) in any form so the stub
    # works regardless of how the wrapping importer calls it.
    async def get_test_stories(*args, **kwargs):
        return stories

    importer_without_e2e.get_stories = get_test_stories

    # The wrapping `E2EImporter` simply forwards these method calls
    assert (await importer_without_e2e.get_stories()).as_story_string() == (
        await importer.get_stories()
    ).as_story_string()
    assert (await importer_without_e2e.get_config()) == (await importer.get_config())

    # Check additional NLU training data from stories was added
    nlu_data = await importer.get_nlu_data()

    # The `E2EImporter` adds NLU training data based on our training stories
    assert len(nlu_data.training_examples) > len(
        (await importer_without_e2e.get_nlu_data()).training_examples
    )

    # Check if the NLU training data was added correctly from the story training data
    expected_additional_messages = [
        Message(data={TEXT: "greet_from_stories", INTENT_NAME: "greet_from_stories"}),
        Message(data={ACTION_NAME: "utter_greet_from_stories", ACTION_TEXT: ""}),
        Message(data={TEXT: "how are you doing?", INTENT_NAME: None}),
        Message(
            data={ACTION_NAME: "utter_greet_from_stories", ACTION_TEXT: "Hi Joey."}
        ),
    ]

    assert all(m in nlu_data.training_examples for m in expected_additional_messages)
async def _train_async_internal(
    file_importer: TrainingDataImporter,
    train_path: Text,
    output_path: Text,
    force_training: bool,
    fixed_model_name: Optional[Text],
    persist_nlu_training_data: bool,
    core_additional_arguments: Optional[Dict] = None,
    nlu_additional_arguments: Optional[Dict] = None,
) -> Optional[Text]:
    """Trains a Rasa model (Core and NLU). Use only from `train_async`.

    If only stories or only NLU data are available, just the corresponding
    model is trained. If both are available, training only happens when the
    fingerprint comparison (or `force_training`) requires it.

    Args:
        file_importer: `TrainingDataImporter` which supplies the training data.
        train_path: Directory in which to train the model.
        output_path: Output path.
        force_training: If `True` retrain model even if data has not changed.
        fixed_model_name: Name of model to be stored.
        persist_nlu_training_data: `True` if the NLU training data should be
            persisted with the model.
        core_additional_arguments: Additional training parameters for core
            training.
        nlu_additional_arguments: Additional training parameters forwarded to
            training method of each NLU component.

    Returns:
        Path of the trained model archive, the path of the old model if nothing
        had to be trained, or `None` if no training data was given.
    """
    stories, nlu_data = await asyncio.gather(
        file_importer.get_stories(), file_importer.get_nlu_data()
    )
    no_stories = stories.is_empty()

    if no_stories and nlu_data.is_empty():
        print_error(
            "No training data given. Please provide stories and NLU data in "
            "order to train a Rasa model using the '--data' argument."
        )
        return

    if no_stories:
        print_warning("No stories present. Just a Rasa NLU model will be trained.")
        return await _train_nlu_with_validated_data(
            file_importer,
            output=output_path,
            fixed_model_name=fixed_model_name,
            persist_nlu_training_data=persist_nlu_training_data,
            additional_arguments=nlu_additional_arguments,
        )

    if nlu_data.is_empty():
        print_warning("No NLU data present. Just a Rasa Core model will be trained.")
        return await _train_core_with_validated_data(
            file_importer,
            output=output_path,
            fixed_model_name=fixed_model_name,
            additional_arguments=core_additional_arguments,
        )

    # Both kinds of data are present: decide what has to be (re)trained.
    new_fingerprint = await model.model_fingerprint(file_importer)
    old_model = model.get_latest_model(output_path)
    comparison = (
        FingerprintComparisonResult(force_training=force_training)
        if force_training
        else model.should_retrain(new_fingerprint, old_model, train_path)
    )

    if not comparison.is_training_required():
        unchanged_message = (
            "Nothing changed. You can use the old model stored at '{}'."
        )
        print_success(unchanged_message.format(os.path.abspath(old_model)))
        return old_model

    await _do_training(
        file_importer,
        output_path=output_path,
        train_path=train_path,
        fingerprint_comparison_result=comparison,
        fixed_model_name=fixed_model_name,
        persist_nlu_training_data=persist_nlu_training_data,
        core_additional_arguments=core_additional_arguments,
        nlu_additional_arguments=nlu_additional_arguments,
        old_model_zip_path=old_model,
    )

    return model.package_model(
        fingerprint=new_fingerprint,
        output_directory=output_path,
        train_path=train_path,
        fixed_model_name=fixed_model_name,
    )
def test_loader_loads_graph_runner(
    default_model_storage: ModelStorage,
    temp_cache: TrainingCache,
    tmp_path: Path,
    tmp_path_factory: TempPathFactory,
    domain_path: Path,
):
    """Train a small graph, then check the loader restores runner and metadata."""
    graph_trainer = GraphTrainer(
        model_storage=default_model_storage,
        cache=temp_cache,
        graph_runner_class=DaskGraphRunner,
    )

    test_value = "test_value"

    # Training graph: one node trains the component, a second node loads the
    # resulting resource and runs inference with it.
    train_node = SchemaNode(
        needs={},
        uses=PersistableTestComponent,
        fn="train",
        constructor_name="create",
        config={"test_value": test_value},
        is_target=True,
    )
    load_after_train_node = SchemaNode(
        needs={"resource": "train"},
        uses=PersistableTestComponent,
        fn="run_inference",
        constructor_name="load",
        config={},
    )
    train_schema = GraphSchema({"train": train_node, "load": load_after_train_node})

    # Prediction graph: a single node which loads the "train" resource.
    predict_node = SchemaNode(
        needs={},
        uses=PersistableTestComponent,
        fn="run_inference",
        constructor_name="load",
        config={},
        is_target=True,
        resource=Resource("train"),
    )
    predict_schema = GraphSchema({"load": predict_node})

    output_filename = tmp_path / "model.tar.gz"

    importer = TrainingDataImporter.load_from_dict(
        training_data_paths=[], domain_path=str(domain_path)
    )

    model_configuration = GraphModelConfiguration(
        train_schema=train_schema,
        predict_schema=predict_schema,
        training_type=TrainingType.BOTH,
        language=None,
        core_target=None,
        nlu_target=None,
    )

    # Time is frozen during training so `trained_at` can be compared exactly.
    trained_at = datetime.utcnow()
    with freezegun.freeze_time(trained_at):
        trained_metadata = graph_trainer.train(
            model_configuration,
            importer=importer,
            output_filename=output_filename,
        )

    assert isinstance(trained_metadata, ModelMetadata)
    assert output_filename.is_file()

    # Load the archive into a fresh storage location.
    loaded_model_storage_path = tmp_path_factory.mktemp("loaded model storage")
    loaded_metadata, loaded_runner = loader.load_predict_graph_runner(
        storage_path=loaded_model_storage_path,
        model_archive_path=output_filename,
        model_storage_class=LocalModelStorage,
        graph_runner_class=DaskGraphRunner,
    )

    assert loaded_runner.run() == {"load": test_value}

    assert loaded_metadata.predict_schema == predict_schema
    assert loaded_metadata.train_schema == train_schema
    assert loaded_metadata.model_id
    expected_domain = Domain.from_path(domain_path).as_dict()
    assert loaded_metadata.domain.as_dict() == expected_domain
    assert loaded_metadata.rasa_open_source_version == rasa.__version__
    assert loaded_metadata.trained_at == trained_at
def test_graph_trainer_returns_model_metadata(
    default_model_storage: ModelStorage,
    temp_cache: TrainingCache,
    tmp_path: Path,
    domain_path: Path,
):
    """Check the metadata returned by `GraphTrainer.train` reflects its inputs."""
    graph_trainer = GraphTrainer(
        model_storage=default_model_storage,
        cache=temp_cache,
        graph_runner_class=DaskGraphRunner,
    )

    test_value = "test_value"

    # Training graph: one node trains the component, a second node loads the
    # resulting resource and runs inference with it.
    train_node = SchemaNode(
        needs={},
        uses=PersistableTestComponent,
        fn="train",
        constructor_name="create",
        config={"test_value": test_value},
        is_target=True,
    )
    load_after_train_node = SchemaNode(
        needs={"resource": "train"},
        uses=PersistableTestComponent,
        fn="run_inference",
        constructor_name="load",
        config={},
    )
    train_schema = GraphSchema({"train": train_node, "load": load_after_train_node})

    # Prediction graph: a single node which loads the "train" resource.
    predict_node = SchemaNode(
        needs={},
        uses=PersistableTestComponent,
        fn="run_inference",
        constructor_name="load",
        config={},
        is_target=True,
        resource=Resource("train"),
    )
    predict_schema = GraphSchema({"load": predict_node})

    output_filename = tmp_path / "model.tar.gz"
    model_configuration = GraphModelConfiguration(
        train_schema=train_schema,
        predict_schema=predict_schema,
        language=None,
        core_target=None,
        nlu_target="nlu",
        training_type=TrainingType.BOTH,
    )
    domain_only_importer = TrainingDataImporter.load_from_dict(
        domain_path=str(domain_path)
    )

    model_metadata = graph_trainer.train(
        model_configuration,
        importer=domain_only_importer,
        output_filename=output_filename,
    )

    assert model_metadata.model_id
    expected_domain = Domain.from_path(domain_path).as_dict()
    assert model_metadata.domain.as_dict() == expected_domain
    assert model_metadata.train_schema == train_schema
    assert model_metadata.predict_schema == predict_schema