async def train(request):
    """Train an NLU model from the request payload and send it back as a file.

    The optional ``model`` query argument is forwarded as the model name.
    Failures are re-raised as ``ErrorResponse`` with status 500 (payload
    extraction or training error), 403 (no free training process) or
    404 (invalid model).
    """
    # When given, this name is used instead of a generated one.
    model_name = request.args.get("model", None)

    try:
        model_config, data_dict = extract_data_and_config(request)
    except Exception as err:
        logger.debug(traceback.format_exc())
        raise ErrorResponse(
            500,
            "ServerError",
            "An unexpected error occurred.",
            details={"error": str(err)},
        )

    data_file = dump_to_data_file(data_dict)
    config_file = dump_to_data_file(model_config, "_config")

    try:
        trained_dir = await data_router.start_train_process(
            data_file, RasaNLUModelConfig(model_config), model_name
        )

        # Package the trained model as a tar.gz archive.
        archive_path = create_model_path(model_name, trained_dir)
        nlu_directory = data.get_nlu_directory(data_file)
        fingerprint = model.model_fingerprint(config_file, nlu_data=nlu_directory)
        model.create_package_rasa(trained_dir, archive_path, fingerprint)

        message = "Rasa NLU model trained and persisted to '{}'.".format(archive_path)
        logger.info(message)

        await data_router.load_model(archive_path)
        return await response.file(archive_path)
    except MaxWorkerProcessError as err:
        raise ErrorResponse(
            403,
            "NoFreeProcess",
            "No process available for training.",
            details={"error": str(err)},
        )
    except InvalidModelError as err:
        raise ErrorResponse(
            404,
            "ModelNotFound",
            "Model '{}' not found.".format(model_name),
            details={"error": str(err)},
        )
    except TrainingException as err:
        logger.debug(traceback.format_exc())
        raise ErrorResponse(
            500,
            "ServerError",
            "An unexpected error occurred.",
            details={"error": str(err)},
        )
def test_count_vectors_featurizer_train():
    """Train a CountVectorsFeaturizer on one message and check its sparse features.

    TEXT and RESPONSE carry the same sentence and must yield the same
    sequence/sentence features; INTENT only yields a (1, 1) sequence feature.
    """

    def _unwrap(feature):
        # Swap a truthy feature wrapper for its `.features`; leave falsy values alone.
        return feature.features if feature else feature

    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([message]))
    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    for attribute in (TEXT, RESPONSE):
        sequence, sentence_level = message.get_sparse_features(attribute, [])
        sequence = _unwrap(sequence)
        sentence_level = _unwrap(sentence_level)

        assert (5, 5) == sequence.shape
        assert (1, 5) == sentence_level.shape
        assert np.all(sequence.toarray()[0] == expected)
        assert np.all(sentence_level.toarray()[-1] == expected_cls)

    sequence, sentence_level = message.get_sparse_features(INTENT, [])
    sequence = _unwrap(sequence)
    sentence_level = _unwrap(sentence_level)

    assert sentence_level is None
    assert (1, 1) == sequence.shape
    assert np.all(sequence.toarray()[0] == np.array([1]))
def duckling_interpreter(component_builder, tmpdir_factory):
    """Return an interpreter built from a DucklingHTTPExtractor-only pipeline."""
    duckling_only = RasaNLUModelConfig(
        {"pipeline": [{"name": "DucklingHTTPExtractor"}]}
    )
    interpreter_kwargs = dict(
        data="./data/examples/rasa/demo-rasa.json",
        path=tmpdir_factory.mktemp("projects").strpath,
        config=duckling_only,
    )
    return utilities.interpreter_for(component_builder, **interpreter_kwargs)
def test_spacy_featurizer(sentence, spacy_nlp):
    """Featurizer output for a doc must match the per-token spaCy vectors."""
    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())
    parsed = spacy_nlp(sentence)

    actual = featurizer._features_for_doc(parsed)
    token_vectors = [token.vector for token in parsed]

    assert np.allclose(actual, token_vectors, atol=1e-5)
def create_component_from_class(self, component_class: Type[C], **cfg: Any) -> C:
    """Instantiate a component from its class and keyword configuration.

    The class is reduced to a ``{"name": ...}`` component config and passed,
    together with a model config built from ``cfg``, to ``create_component``.
    The original docstring notes this is mainly used to benefit from caching
    when instantiating component classes.
    """
    name_only_config = {"name": component_class.name}
    model_config = RasaNLUModelConfig(cfg)
    return self.create_component(name_only_config, model_config)
def test_run_cv_evaluation_with_response_selector():
    """Cross-validate a pipeline with a ResponseSelector and check result sizes.

    Every metric list (train and test) for intents, response selection and
    DIETClassifier entities must hold one entry per fold.
    """
    base_data = training_data.load_data("data/examples/rasa/demo-rasa.md")
    responses_data = training_data.load_data(
        "data/examples/rasa/demo-rasa-responses.md"
    )
    merged_data = base_data.merge(responses_data)
    merged_data.fill_response_phrases()

    pipeline = [
        {"name": "WhitespaceTokenizer"},
        {"name": "CountVectorsFeaturizer"},
        {"name": "DIETClassifier", EPOCHS: 2},
        {"name": "ResponseSelector", EPOCHS: 2},
    ]
    nlu_config = RasaNLUModelConfig({"language": "en", "pipeline": pipeline})

    n_folds = 2
    intent_results, entity_results, response_selection_results = cross_validate(
        merged_data,
        n_folds,
        nlu_config,
        successes=False,
        errors=False,
        disable_plotting=True,
    )

    metrics = ("Accuracy", "Precision", "F1-score")

    for results in (intent_results, response_selection_results):
        for split in (results.train, results.test):
            for metric in metrics:
                assert len(split[metric]) == n_folds

    for split in (entity_results.train, entity_results.test):
        for metric in metrics:
            assert len(split["DIETClassifier"][metric]) == n_folds
def test_load_and_persist_without_train(language, pipeline, component_builder, tmpdir):
    """An untrained pipeline can be persisted via a persistor, reloaded and used."""
    nlu_config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
    untrained = Trainer(nlu_config, component_builder)
    persistor = create_persistor(nlu_config)
    model_path = untrained.persist(tmpdir.strpath, persistor)

    loaded = Interpreter.load(model_path, component_builder)

    assert loaded.pipeline
    for text in ("hello", "Hello today is Monday, again!"):
        assert loaded.parse(text) is not None
def test_whitespace_language_suuport(language, error, component_builder):
    """Creating a WhitespaceTokenizer fails only for languages flagged with `error`."""
    tokenizer_config = {"name": "WhitespaceTokenizer"}
    config = RasaNLUModelConfig(
        {"language": language, "pipeline": [{"name": "WhitespaceTokenizer"}]}
    )

    if not error:
        component_builder.create_component(tokenizer_config, config)
        return

    with pytest.raises(UnsupportedLanguageError):
        component_builder.create_component(tokenizer_config, config)
def trained_nlu_model():
    """Train the "keyword" pipeline on the default data and return the persisted path."""
    trainer = Trainer(RasaNLUModelConfig({"pipeline": "keyword"}))
    examples = training_data.load_data(DEFAULT_DATA_PATH)
    trainer.train(examples)
    return trainer.persist("test_models", project_name="test_model_keyword")
def test_load_and_persist_without_train(language, pipeline, component_builder, tmpdir):
    """An untrained pipeline can be persisted, reloaded and still parse text."""
    nlu_config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
    model_path = Trainer(nlu_config, component_builder).persist(tmpdir.strpath)

    loaded = Interpreter.load(model_path, component_builder)

    assert loaded.pipeline
    parse_result = loaded.parse("Rasa is great!")
    assert parse_result is not None
def test_spacy_featurizer(sentence, spacy_nlp):
    """With `return_sequence` enabled, features must match per-token spaCy vectors."""
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    featurizer = SpacyFeaturizer.create(
        {"return_sequence": True}, RasaNLUModelConfig()
    )
    parsed = spacy_nlp(sentence)

    actual = featurizer._features_for_doc(parsed)
    token_vectors = [token.vector for token in parsed]

    assert np.allclose(actual, token_vectors, atol=1e-5)
def test_mitie_featurizer(mitie_feature_extractor, default_config):
    """The first five MITIE feature values for a fixed sentence match reference values."""
    from rasa.nlu.featurizers.mitie_featurizer import MitieFeaturizer

    featurizer = MitieFeaturizer.create(
        {"name": "MitieFeaturizer"}, RasaNLUModelConfig()
    )

    tokens = MitieTokenizer().tokenize("Hey how are you today")
    features = featurizer.features_for_tokens(tokens, mitie_feature_extractor)

    reference = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])
    assert np.allclose(features[:5], reference, atol=1e-5)
def test_train_with_empty_data(language, pipeline, component_builder, tmpdir):
    """Training on empty data still yields a model that persists, loads and parses."""
    nlu_config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
    trainer = Trainer(nlu_config, component_builder)
    trainer.train(TrainingData())

    persistor = create_persistor(nlu_config)
    model_path = trainer.persist(
        tmpdir.strpath, persistor, project_name="my_project"
    )
    loaded = Interpreter.load(model_path, component_builder)

    assert loaded.pipeline
    for text in ("hello", "Hello today is Monday, again!"):
        assert loaded.parse(text) is not None
def test_run_cv_evaluation(
    pretrained_embeddings_spacy_config: RasaNLUModelConfig, monkeypatch: MonkeyPatch
):
    """Cross-validate with a mocked `Trainer.train` and check the result structure.

    Intent metrics must hold one value per fold, and the intent and entity
    evaluations must expose "errors" and "report" entries.
    """
    td = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa.json"
    )

    pipeline = [
        {"name": "WhitespaceTokenizer"},
        {"name": "CountVectorsFeaturizer"},
        {"name": "DIETClassifier", EPOCHS: 2},
    ]
    nlu_config = RasaNLUModelConfig({"language": "en", "pipeline": pipeline})

    # Replace real training with a mock returning an interpreter over the pipeline.
    trainer = Trainer(nlu_config)
    trainer.pipeline = remove_pretrained_extractors(trainer.pipeline)
    fake_train = Mock(return_value=Interpreter(trainer.pipeline, None))
    monkeypatch.setattr(Trainer, "train", fake_train)

    n_folds = 2
    intent_results, entity_results, response_selection_results = cross_validate(
        td,
        n_folds,
        nlu_config,
        successes=False,
        errors=False,
        disable_plotting=True,
        report_as_dict=True,
    )

    for split in (intent_results.train, intent_results.test):
        for metric in ("Accuracy", "Precision", "F1-score"):
            assert len(split[metric]) == n_folds

    required_keys = ["errors", "report"]
    assert all(key in intent_results.evaluation for key in required_keys)
    assert any(
        isinstance(intent_report, dict)
        and intent_report.get("confused_with") is not None
        for intent_report in intent_results.evaluation["report"].values()
    )

    for extractor_evaluation in entity_results.evaluation.values():
        assert all(key in extractor_evaluation for key in ["errors", "report"])
def pretrained_embeddings_convert_config() -> RasaNLUModelConfig:
    """Return an English config: WhitespaceTokenizer, ConveRTFeaturizer, DIETClassifier."""
    pipeline = [
        {"name": "WhitespaceTokenizer"},
        {"name": "ConveRTFeaturizer"},
        {"name": "DIETClassifier", EPOCHS: 1, RANDOM_SEED: 42},
    ]
    return RasaNLUModelConfig({"language": "en", "pipeline": pipeline})
def test_convert_featurizer_train(monkeypatch: MonkeyPatch):
    """Train a ConveRTFeaturizer on one message and compare dense features.

    TEXT and RESPONSE share the sentence and must give the same leading
    feature values; INTENT must have no dense features.
    """
    tokenizer = WhitespaceTokenizer()

    # Skip URL validation by always returning the restricted-access URL.
    monkeypatch.setattr(
        ConveRTFeaturizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    featurizer = ConveRTFeaturizer(
        {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    )

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)

    td = TrainingData([message])
    tokenizer.train(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    for attribute in (TEXT, RESPONSE):
        message.set(TOKENS_NAMES[attribute], tokens)

    featurizer.train(
        TrainingData([message]),
        RasaNLUModelConfig(),
        tf_hub_module=featurizer.module,
    )

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    for attribute in (TEXT, RESPONSE):
        sequence, sentence_level = message.get_dense_features(attribute, [])
        sequence = sequence.features
        sentence_level = sentence_level.features

        assert len(tokens) == len(sequence)
        assert np.allclose(sequence[0][:5], expected, atol=1e-5)
        assert np.allclose(sentence_level[-1][:5], expected_cls, atol=1e-5)

    sequence, sentence_level = message.get_dense_features(INTENT, [])

    assert sequence is None
    assert sentence_level is None
def test_train_model_noents(language, pipeline, component_builder, tmpdir):
    """Train on the entity-free demo data, then reload the model and parse text."""
    nlu_config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})

    trained, _, model_path = train(
        nlu_config,
        path=tmpdir.strpath,
        data="./data/test/demo-rasa-noents.json",
        component_builder=component_builder,
    )
    assert trained.pipeline

    loaded = Interpreter.load(model_path, component_builder)

    assert loaded.pipeline
    for text in ("hello", "Hello today is Monday, again!"):
        assert loaded.parse(text) is not None
async def test_validate_component_keys_raises_warning_on_invalid_key(tmp_path: Path,):
    """Training with an unknown component key must emit an "invalid key" UserWarning."""
    # `confidence_threshold` is not a property of the WhitespaceTokenizer.
    tokenizer_with_bad_key = {"name": "WhitespaceTokenizer", "confidence_threshold": 0.7}
    _config = RasaNLUModelConfig({"pipeline": [tokenizer_with_bad_key]})

    with pytest.warns(UserWarning) as record:
        await train(_config, data=DEFAULT_DATA_PATH, path=str(tmp_path))

    first_message = record[0].message.args[0]
    assert "You have provided an invalid key" in first_message
async def test_train_model_checkpointing(
    component_builder: ComponentBuilder, tmpdir: Path
):
    """Training a ResponseSelector with checkpointing writes the model files."""
    from pathlib import Path

    model_name = "rs-checkpointed-model"
    best_model_file = Path(str(tmpdir), model_name)
    assert not best_model_file.exists()

    featurizer_config = {
        "name": "CountVectorsFeaturizer",
        "analyzer": "char_wb",
        "min_ngram": 3,
        "max_ngram": 17,
        "max_features": 10,
        "min_df": 5,
    }
    selector_config = {
        "name": "ResponseSelector",
        EPOCHS: 5,
        MODEL_CONFIDENCE: "linear_norm",
        CONSTRAIN_SIMILARITIES: True,
        CHECKPOINT_MODEL: True,
    }
    _config = RasaNLUModelConfig(
        {
            "pipeline": [
                {"name": "WhitespaceTokenizer"},
                featurizer_config,
                selector_config,
            ],
            "language": "en",
        }
    )

    await rasa.nlu.train.train(
        _config,
        path=str(tmpdir),
        data="data/test_selectors",
        component_builder=component_builder,
        fixed_model_name=model_name,
    )

    assert best_model_file.exists()

    # Tricky to validate the *exact* number of files that should be there,
    # however there must be at least the following:
    #   - metadata.json
    #   - checkpoint
    #   - component_1_CountVectorsFeaturizer (as per the pipeline above)
    #   - component_2_ResponseSelector files (more than 1 file)
    file_count = len(list(best_model_file.rglob("*.*")))
    assert file_count > 4
async def test_validate_requirements_raises_exception_on_component_without_name(
    tmp_path: Path,
):
    """Training must raise InvalidConfigException for a component lacking `name`."""
    # The only pipeline entry has no `name` property.
    nameless_component = {"parameter": 4}
    _config = RasaNLUModelConfig({"pipeline": [nameless_component]})

    with pytest.raises(InvalidConfigException):
        await train(_config, data=DEFAULT_DATA_PATH, path=str(tmp_path))
async def train(self):
    """Train and persist the NLU and core models for the configured skill.

    Nothing is trained when ``data/<skill-id>`` already exists. Otherwise
    stories are generated, an NLU model is trained and persisted under
    ``data/<skill-id>``, and a policy ensemble plus the domain are trained
    and persisted under ``data/<skill-id>/core``.
    """
    nltk.download('punkt')
    lang = self.config['language']
    skill_id = self.config['skill-id']
    model_dir = 'data/' + skill_id

    # An existing model directory means the skill has already been trained.
    if os.path.exists(model_dir):
        return

    _LOGGER.info("Starting Skill training.")
    _LOGGER.info("Generating stories.")
    data, domain_data, stories = await GenerateStories.run(
        skill_id, lang, self.asm)

    # --- NLU ---
    nlu_examples = TrainingData(training_examples=data)
    nlu_config = RasaNLUModelConfig({
        "language": lang,
        "pipeline": self.config['pipeline'],
        "data": None
    })
    nlu_trainer = Trainer(nlu_config, None, True)
    _LOGGER.info("Training Arcus NLU")
    nlu_trainer.train(nlu_examples)
    nlu_trainer.persist(model_dir, None, 'nlu')

    # --- Rasa core ---
    domain = Domain.from_dict(domain_data)
    story_reader = StoryFileReader(domain, RegexInterpreter(), None, False)
    story_steps = await story_reader.process_lines(stories)
    story_graph = StoryGraph(story_steps)

    generator = TrainingDataGenerator(
        story_graph,
        domain,
        remove_duplicates=True,
        unique_last_num_states=None,
        augmentation_factor=20,
        tracker_limit=None,
        use_story_concatenation=True,
        debug_plots=False,
    )
    training_trackers = generator.generate()

    policy_list = SimplePolicyEnsemble.from_dict(
        {"policies": self.config['policies']})
    policy_ensemble = SimplePolicyEnsemble(policy_list)

    _LOGGER.info("Training Arcus Core")
    policy_ensemble.train(training_trackers, domain)

    core_dir = model_dir + "/core"
    policy_ensemble.persist(core_dir, False)
    domain.persist(model_dir + "/core/model")
    domain.persist_specification(core_dir)
def test_train_selector(pipeline, component_builder, tmpdir):
    """Train on data with responses and check the response selector's parse output."""
    load_data = rasa.shared.nlu.training_data.loading.load_data

    # Use data that include some responses.
    training_data = load_data("data/examples/rasa/demo-rasa.md")
    training_data_responses = load_data("data/examples/rasa/demo-rasa-responses.md")
    training_data = training_data.merge(training_data_responses)

    nlu_config = RasaNLUModelConfig({"language": "en", "pipeline": pipeline})

    trainer = Trainer(nlu_config)
    trainer.train(training_data)
    persisted_path = trainer.persist(tmpdir)

    assert trainer.pipeline

    loaded = Interpreter.load(persisted_path, component_builder)
    parsed = loaded.parse("hello")

    assert loaded.pipeline
    assert parsed is not None
    assert (parsed.get("response_selector").get("all_retrieval_intents")) == [
        "chitchat"
    ]

    for response_key in (
        "intent_response_key",
        "template_name",
        "response_templates",
    ):
        response = parsed.get("response_selector").get("default").get("response")
        assert response.get(response_key) is not None

    ranking = parsed.get("response_selector").get("default").get("ranking")
    assert ranking is not None

    for entry in ranking:
        assert entry.get("confidence") is not None
        assert entry.get("intent_response_key") is not None
def validate_rasa_config(config: Dict):
    """
    Validate the content of a bot config.yml for invalid entries.

    Each pipeline component is instantiated through a ComponentBuilder and the
    configuration is then handed to ``configuration.load``.

    :param config: configuration
    :return: None
    """
    rasa_config = RasaNLUModelConfig(config)
    builder = ComponentBuilder()

    for position, _ in enumerate(rasa_config.pipeline):
        component_cfg = rasa_config.for_component(position)
        builder.create_component(component_cfg, rasa_config)

    configuration.load(config)
def test_mitie_featurizer_train(mitie_feature_extractor):
    """Train a MitieFeaturizer on one message and compare its dense features.

    TEXT and RESPONSE share the sentence and must give the same leading
    feature values; INTENT must have no dense features.
    """
    featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    MitieTokenizer().train(TrainingData([message]))
    featurizer.train(
        TrainingData([message]),
        RasaNLUModelConfig(),
        mitie_feature_extractor=mitie_feature_extractor,
    )

    expected = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    for attribute in (TEXT, RESPONSE):
        sequence, sentence_level = message.get_dense_features(attribute, [])

        assert len(message.get(TOKENS_NAMES[attribute])) == len(sequence)
        assert np.allclose(sequence[0][:5], expected, atol=1e-5)
        assert np.allclose(sentence_level[-1][:5], expected_cls, atol=1e-5)

    sequence, sentence_level = message.get_dense_features(INTENT, [])

    assert sequence is None
    assert sentence_level is None
def test_train_model_on_test_pipelines(language, pipeline, component_builder, tmpdir):
    """Train on the default data, then reload the persisted model and parse text."""
    nlu_config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})

    trained, _, model_path = train(
        nlu_config,
        path=tmpdir.strpath,
        data=DEFAULT_DATA_PATH,
        component_builder=component_builder,
    )
    assert trained.pipeline

    loaded = Interpreter.load(model_path, component_builder)

    assert loaded.pipeline
    for text in ("hello", "Hello today is Monday, again!"):
        assert loaded.parse(text) is not None
async def test_eval_data(component_builder, tmpdir, project):
    """Train on the demo data and check `get_eval_data` returns 46 results per kind."""
    pipeline = [
        {"name": "WhitespaceTokenizer"},
        {"name": "CountVectorsFeaturizer"},
        {"name": "DIETClassifier", "epochs": 2},
        {"name": "ResponseSelector", "epochs": 2},
    ]
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})

    config_path = os.path.join(project, "config.yml")
    nlu_files = [
        "data/examples/rasa/demo-rasa.md",
        "data/examples/rasa/demo-rasa-responses.md",
    ]
    data_importer = TrainingDataImporter.load_nlu_importer_from_config(
        config_path, training_data_paths=nlu_files
    )

    _, _, persisted_path = await train(
        _config,
        path=tmpdir.strpath,
        data=data_importer,
        component_builder=component_builder,
        persist_nlu_training_data=True,
    )

    interpreter = Interpreter.load(persisted_path, component_builder)
    data = await data_importer.get_nlu_data()

    eval_data = get_eval_data(interpreter, data)
    intent_results, response_selection_results, entity_results = eval_data

    assert len(intent_results) == 46
    assert len(response_selection_results) == 46
    assert len(entity_results) == 46
def test_warn_of_competing_extractors(
    pipeline_template: List[Dict[Text, Text]], should_warn: bool
):
    """Check when `warn_of_competing_extractors` emits a UserWarning.

    :param pipeline_template: pipeline component configs used to build the trainer
    :param should_warn: whether a UserWarning is expected for this pipeline
    """
    # Local import so this block does not depend on the file's import section.
    import warnings

    config = RasaNLUModelConfig({"pipeline": pipeline_template})
    trainer = Trainer(config)

    if should_warn:
        with pytest.warns(UserWarning):
            rasa.nlu.components.warn_of_competing_extractors(trainer.pipeline)
    else:
        # `pytest.warns(None)` is deprecated since pytest 7 and rejected by
        # pytest 8, so record warnings with the standard library instead.
        with warnings.catch_warnings(record=True) as records:
            # Make sure no warning is hidden by an active filter.
            warnings.simplefilter("always")
            rasa.nlu.components.warn_of_competing_extractors(trainer.pipeline)

        assert len(records) == 0
def train():
    """Train a model from the project's demo data, reload it and show one parse.

    Data and config are read from ``prj_dir``; the model is persisted under
    ``<prj_dir>/models``. The displayed result omits "intent_ranking".
    """
    training_examples = load_data("{}/demo_rasa.json".format(prj_dir))
    nlu_config = RasaNLUModelConfig(load_json("{}/config.json".format(prj_dir)))

    trainer = Trainer(nlu_config)
    trainer.train(training_examples)
    model_path = trainer.persist("{}/models".format(prj_dir))

    loaded = Interpreter.load(model_path)
    assert loaded.pipeline

    # Inference
    result = loaded.parse("i'm looking for a place in the north of town")
    result = loaded.parse("show me chinese restaurants")
    result = {
        key: value for key, value in result.items() if key != "intent_ranking"
    }
    show_dict(result)
async def test_validate_requirements_raises_exception_on_component_without_name(
    tmp_path: Path,
):
    """Training must raise InvalidConfigError for a component lacking `name`."""
    # The only pipeline entry has no `name` property.
    nameless_component = {"parameter": 4}
    _config = RasaNLUModelConfig({"pipeline": [nameless_component]})

    with pytest.raises(InvalidConfigError):
        await train(
            _config,
            data="./data/examples/rasa/demo-rasa.json",
            path=str(tmp_path),
        )
def test_mitie_featurizer_no_sequence(mitie_feature_extractor, default_config):
    """With `return_sequence` off, the first feature row matches reference values."""
    from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer

    featurizer = MitieFeaturizer.create(
        {"name": "MitieFeaturizer", "return_sequence": False}, RasaNLUModelConfig()
    )

    sentence = f"Hey how are you today {CLS_TOKEN}"
    tokens = MitieTokenizer().tokenize(sentence)

    all_features = featurizer.features_for_tokens(tokens, mitie_feature_extractor)
    first_row = all_features[0]

    reference = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])
    assert np.allclose(first_row[:5], reference, atol=1e-5)