async def test_train_persist_load_with_composite_entities(
    component_builder: ComponentBuilder, tmp_path: Path
):
    """Train a tokenizer + CRF pipeline on composite-entity data and reload it.

    The interpreter loaded from the persisted path must parse a sample
    sentence to the same result as the interpreter returned by training.
    """
    nlu_config = RasaNLUModelConfig(
        {
            "pipeline": pipeline_from_components(
                "WhitespaceTokenizer", "CRFEntityExtractor"
            ),
            "language": "en",
        }
    )

    trainer, trained, persisted_path = await rasa.nlu.train.train(
        nlu_config,
        path=str(tmp_path),
        data="data/test/demo-rasa-composite-entities.yml",
        component_builder=component_builder,
    )
    assert trainer.pipeline
    assert trained.pipeline

    reloaded = Interpreter.load(persisted_path, component_builder)
    assert reloaded.pipeline

    utterance = "I am looking for an italian restaurant"
    assert reloaded.parse(utterance) == trained.parse(utterance)
def test_spacy_featurizer_casing(spacy_nlp):
    """Check that spaCy features are the same for a text and its capitalized form.

    If this starts failing for the default model, we should think about
    removing the lower casing the spacy nlp component does when it
    retrieves vectors. For compressed spacy models (e.g. models ending
    in _sm) this test will most likely fail.
    """
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())
    examples = training_data.load_data("data/examples/rasa/demo-rasa.json")

    for example in examples.intent_examples:
        # Build both documents first, then featurize them in the same order.
        plain_doc = spacy_nlp(example.text)
        capitalized_doc = spacy_nlp(example.text.capitalize())

        plain_features = featurizer._features_for_doc(plain_doc)
        capitalized_features = featurizer._features_for_doc(capitalized_doc)

        assert np.allclose(
            plain_features, capitalized_features, atol=1e-5
        ), "Vectors are unequal for texts '{}' and '{}'".format(
            example.text, example.text.capitalize()
        )
async def test_train_tensorboard_logging(component_builder, tmpdir):
    """Train DIET with tensorboard logging enabled and check the log output.

    The log directory must not exist before training, must exist afterwards
    and must contain exactly three files matching ``*.*``.
    """
    from pathlib import Path

    log_dir = Path(tmpdir.strpath) / "tensorboard"
    assert not log_dir.exists()

    diet_settings = {
        "name": "DIETClassifier",
        EPOCHS: 3,
        TENSORBOARD_LOG_LEVEL: "epoch",
        TENSORBOARD_LOG_DIR: str(log_dir),
        EVAL_NUM_EXAMPLES: 15,
        EVAL_NUM_EPOCHS: 1,
    }
    nlu_config = RasaNLUModelConfig(
        {
            "pipeline": [
                {"name": "WhitespaceTokenizer"},
                {"name": "CountVectorsFeaturizer"},
                diet_settings,
            ],
            "language": "en",
        }
    )

    await train(
        nlu_config,
        path=tmpdir.strpath,
        data="data/examples/rasa/demo-rasa-multi-intent.md",
        component_builder=component_builder,
    )

    assert log_dir.exists()

    logged_files = list(log_dir.rglob("*.*"))
    assert len(logged_files) == 3
async def test_inner_linear_normalization(
    component_builder: ComponentBuilder,
    tmp_path: Path,
    classifier_params: Dict[Text, Any],
    data_path: Text,
    monkeypatch: MonkeyPatch,
):
    """Check DIET intent confidences sum to 1 without calling ``normalize``.

    ``train_utils.normalize`` is replaced by a mock after loading the model;
    parsing must produce a ranking whose confidences sum to one while the
    mock stays uncalled.
    """
    pipeline = as_pipeline(
        "WhitespaceTokenizer", "CountVectorsFeaturizer", "DIETClassifier"
    )
    classifier_entry = pipeline[2]
    assert classifier_entry["name"] == "DIETClassifier"
    classifier_entry.update(classifier_params)

    nlu_config = RasaNLUModelConfig({"pipeline": pipeline})
    _trained_model, _, persisted_path = await rasa.nlu.train.train(
        nlu_config,
        path=str(tmp_path),
        data=data_path,
        component_builder=component_builder,
    )
    loaded = Interpreter.load(persisted_path, component_builder)

    normalize_spy = Mock()
    monkeypatch.setattr(train_utils, "normalize", normalize_spy.normalize)

    parse_data = loaded.parse("hello")
    ranking = parse_data.get("intent_ranking")

    # check whether normalization had the expected effect
    confidence_total = sum(entry.get("confidence") for entry in ranking)
    assert confidence_total == pytest.approx(1)

    # check whether the normalization of rankings is reflected in intent prediction
    assert parse_data.get("intent") == ranking[0]

    # normalize shouldn't have been called
    normalize_spy.normalize.assert_not_called()
def test_train_selector(pipeline, component_builder, tmpdir):
    """Train a response-selector pipeline and inspect the parsed selector output.

    Checks the retrieval intents, the fields of the default response and
    that every ranking entry carries a confidence and a response key.
    """
    # use data that include some responses
    load = rasa.shared.nlu.training_data.loading.load_data
    base_data = load("data/examples/rasa/demo-rasa.md")
    response_data = load("data/examples/rasa/demo-rasa-responses.md")
    merged_data = base_data.merge(response_data)

    trainer = Trainer(RasaNLUModelConfig({"language": "en", "pipeline": pipeline}))
    trainer.train(merged_data)
    persisted_path = trainer.persist(tmpdir)
    assert trainer.pipeline

    loaded = Interpreter.load(persisted_path, component_builder)
    parsed = loaded.parse("hello")
    assert loaded.pipeline
    assert parsed is not None

    selector_output = parsed.get("response_selector")
    assert selector_output.get("all_retrieval_intents") == ["chitchat"]

    # Looked up only after the retrieval-intent check, as in the chained form.
    default_response = selector_output.get("default").get("response")
    for field in ("intent_response_key", "template_name", "response_templates"):
        assert default_response.get(field) is not None

    ranking = selector_output.get("default").get("ranking")
    assert ranking is not None

    for entry in ranking:
        assert entry.get("confidence") is not None
        assert entry.get("intent_response_key") is not None
async def test_raise_error_on_incorrect_pipeline(component_builder, tmp_path: Path):
    """Training DIET without any featurizer must raise a descriptive error."""
    pipeline_without_featurizer = [
        {"name": "WhitespaceTokenizer"},
        {"name": "DIETClassifier", EPOCHS: 1},
    ]
    nlu_config = RasaNLUModelConfig(
        {"pipeline": pipeline_without_featurizer, "language": "en"}
    )

    with pytest.raises(Exception) as e:
        await train(
            nlu_config,
            path=str(tmp_path),
            data=DEFAULT_DATA_PATH,
            component_builder=component_builder,
        )

    expected_message = (
        "'DIETClassifier' requires ['Featurizer']. "
        "Add required components to the pipeline."
    )
    assert expected_message in str(e.value)
def test_train_model_without_data():
    """Train with a pipeline read from a JSON file, persist, reload and parse.

    The parse result is displayed via ``show_dict`` with the
    ``intent_ranking`` entry left out.
    """
    examples = load_data(DEFAULT_DATA_PATH)

    pipeline = load_json(
        "{}/test_case/test_pipelines/config_pipeline.json".format(prj_dir)
    )
    nlu_config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})

    trainer = Trainer(nlu_config)
    trainer.train(examples)
    persisted_path = trainer.persist(model_dir)

    loaded = Interpreter.load(persisted_path)
    assert loaded.pipeline

    # Inference
    parsed = loaded.parse("show me chinese restaurants")
    hidden_keys = ["intent_ranking"]
    visible = {key: value for key, value in parsed.items() if key not in hidden_keys}
    show_dict(visible)
async def test_train_model_training_data_persisted(component_builder, tmpdir):
    """Train with ``persist_nlu_training_data=True`` and check the metadata.

    The reloaded model's metadata must have a ``training_data`` entry.
    """
    nlu_config = RasaNLUModelConfig(
        {"pipeline": [{"name": "KeywordIntentClassifier"}], "language": "en"}
    )

    trained, _, persisted_path = await train(
        nlu_config,
        path=tmpdir.strpath,
        data=DEFAULT_DATA_PATH,
        component_builder=component_builder,
        persist_nlu_training_data=True,
    )
    assert trained.pipeline

    reloaded = Interpreter.load(persisted_path, component_builder)
    assert reloaded.pipeline
    assert reloaded.model_metadata.get("training_data") is not None
async def test_elmo_train(component_builder, tmpdir):
    """Train a pipeline containing ``ElmoFeaturizer`` and parse two sentences."""
    component_names = (
        "WhitespaceTokenizer",
        "ElmoFeaturizer",
        "CountVectorsFeaturizer",
        "EmbeddingIntentClassifier",
    )
    nlu_config = RasaNLUModelConfig(
        {
            "pipeline": [{"name": name} for name in component_names],
            "language": "en",
        }
    )

    trained, _, persisted_path = await train(
        nlu_config,
        path=tmpdir.strpath,
        data=DEFAULT_DATA_PATH,
        component_builder=component_builder,
    )
    assert trained.pipeline

    reloaded = Interpreter.load(persisted_path, component_builder)
    assert reloaded.pipeline

    for utterance in ("hello", "Hello today is Monday, again!"):
        assert reloaded.parse(utterance) is not None
def test_train_selector(pipeline, component_builder, tmpdir):
    """Train a response-selector pipeline and check the default selector output.

    After persisting and reloading, parsing "hello" must yield a non-None
    ``full_retrieval_intent`` under the default response selector.
    """
    # use data that include some responses
    merged_data = load_data("data/examples/rasa/demo-rasa.md").merge(
        load_data("data/examples/rasa/demo-rasa-responses.md")
    )
    merged_data.fill_response_phrases()

    trainer = Trainer(RasaNLUModelConfig({"language": "en", "pipeline": pipeline}))
    trainer.train(merged_data)
    persisted_path = trainer.persist(tmpdir)
    assert trainer.pipeline

    reloaded = Interpreter.load(persisted_path, component_builder)
    parsed = reloaded.parse("hello")
    assert reloaded.pipeline
    assert parsed is not None

    default_selection = parsed.get(RESPONSE_SELECTOR_PROPERTY_NAME).get("default")
    assert default_selection.get("full_retrieval_intent") is not None
def test_convert_featurizer_train(component_builder):
    """Train the ConveRT featurizer on one message and compare feature values.

    The first five dimensions of the first and last token vectors are
    compared to fixed reference values for both TEXT and RESPONSE; INTENT
    must have no dense features.
    """
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)

    tokens = tokenizer.tokenize(message, attribute=TEXT)
    tokens = tokenizer.add_cls_token(tokens, attribute=TEXT)
    for attribute in (TEXT, RESPONSE):
        message.set(TOKENS_NAMES[attribute], tokens)

    featurizer.train(
        TrainingData([message]),
        RasaNLUModelConfig(),
        tf_hub_module=tokenizer.module,
    )

    expected_first = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456]
    )
    expected_last = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    for attribute in (TEXT, RESPONSE):
        features = message.get_dense_features(attribute, [])
        assert len(tokens) == len(features)
        assert np.allclose(features[0][:5], expected_first, atol=1e-5)
        assert np.allclose(features[-1][:5], expected_last, atol=1e-5)

    assert message.get_dense_features(INTENT, []) is None
def train(self, request):
    """Train a model from the request and return the zipped model bytes.

    This is a generator: it yields the result of
    ``self.data_router.start_train_process`` and delivers its own result
    through ``returnValue``.
    NOTE(review): presumably wrapped by Twisted's ``inlineCallbacks``
    outside this view -- confirm.

    Response codes set on ``request``:
        200 -- training succeeded; the zip content is returned.
        400 -- data/config could not be extracted from the request.
        403 -- ``MaxTrainingError``.
        404 -- ``InvalidProjectError``.
        500 -- ``TrainingException``.

    On the error paths the returned value is a JSON string of the form
    ``{"error": "<message>"}``.
    """
    # if not set will use the default project name, e.g. "default"
    project = parameter_or_default(request, "project", default=None)
    # if set will not generate a model name but use the passed one
    model_name = parameter_or_default(request, "model", default=None)

    try:
        model_config, data = self.extract_data_and_config(request)
    except Exception as e:
        request.setResponseCode(400)
        returnValue(json_to_string({"error": "{}".format(e)}))

    data_file = dump_to_data_file(data)

    # NOTE(review): this header is set before training, so the JSON error
    # bodies below are also sent with it -- confirm this is intended.
    request.setHeader('Content-Type', 'application/zip')

    try:
        request.setResponseCode(200)
        request.setHeader("Content-Disposition", "attachment")
        path_to_model = yield self.data_router.start_train_process(
            data_file, project, RasaNLUModelConfig(model_config), model_name
        )
        zipped_path = utils.zip_folder(path_to_model)

        # Read-only access is sufficient; the context manager guarantees the
        # handle is closed instead of leaking it until garbage collection.
        with io.open(zipped_path, "rb") as zip_file:
            zip_content = zip_file.read()
        returnValue(zip_content)
    except MaxTrainingError as e:
        request.setResponseCode(403)
        returnValue(json_to_string({"error": "{}".format(e)}))
    except InvalidProjectError as e:
        request.setResponseCode(404)
        returnValue(json_to_string({"error": "{}".format(e)}))
    except TrainingException as e:
        request.setResponseCode(500)
        returnValue(json_to_string({"error": "{}".format(e)}))
async def test_softmax_normalization(
    component_builder,
    tmp_path,
    classifier_params,
    data_path,
    output_length,
    output_should_sum_to_1,
):
    """Check ranking length and confidence sum for the given DIET parameters.

    ``output_length`` is the expected number of ranked intents and
    ``output_should_sum_to_1`` states whether the confidences should
    add up to one.
    """
    pipeline = as_pipeline(
        "WhitespaceTokenizer", "CountVectorsFeaturizer", "DIETClassifier"
    )
    classifier_entry = pipeline[2]
    assert classifier_entry["name"] == "DIETClassifier"
    classifier_entry.update(classifier_params)

    nlu_config = RasaNLUModelConfig({"pipeline": pipeline})
    _trained_model, _, persisted_path = await train(
        nlu_config,
        path=str(tmp_path),
        data=data_path,
        component_builder=component_builder,
    )
    loaded = Interpreter.load(persisted_path, component_builder)

    parse_data = loaded.parse("hello")
    ranking = parse_data.get("intent_ranking")

    # check that the output was correctly truncated after normalization
    assert len(ranking) == output_length

    # check whether normalization had the expected effect
    confidence_total = sum(entry.get("confidence") for entry in ranking)
    sums_to_one = confidence_total == pytest.approx(1)
    assert sums_to_one == output_should_sum_to_1

    # check whether the normalization of rankings is reflected in intent prediction
    assert parse_data.get("intent") == ranking[0]
async def test_set_random_seed(component_builder, tmpdir):
    """test if train result is the same for two runs of tf embedding"""
    # set fixed random seed
    seeded_pipeline = [
        {"name": "WhitespaceTokenizer"},
        {"name": "CountVectorsFeaturizer"},
        {"name": "DIETClassifier", RANDOM_SEED: 1, EPOCHS: 1},
    ]
    nlu_config = RasaNLUModelConfig({"pipeline": seeded_pipeline, "language": "en"})

    # first and second run, each persisted to its own directory
    persisted_paths = []
    for suffix in ("_a", "_b"):
        _trained, _, persisted_path = await train(
            nlu_config,
            path=tmpdir.strpath + suffix,
            data=DEFAULT_DATA_PATH,
            component_builder=component_builder,
        )
        persisted_paths.append(persisted_path)

    interpreters = [
        Interpreter.load(path, component_builder) for path in persisted_paths
    ]
    confidences = [
        interpreter.parse("hello")["intent"]["confidence"]
        for interpreter in interpreters
    ]

    assert confidences[0] == confidences[1]
def test_spacy_featurizer_using_empty_model():
    """Featurizing with a blank spaCy model must yield no dense features."""
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
    import spacy

    sentence = "This test is using an empty spaCy model"
    blank_doc = spacy.blank("en")(sentence)

    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], blank_doc)
    featurizer._set_spacy_features(message)

    sequence, sentence_level = message.get_dense_features(TEXT, [])

    # Unwrap the feature payloads only when a truthy result was returned.
    sequence = sequence.features if sequence else sequence
    sentence_level = sentence_level.features if sentence_level else sentence_level

    assert sequence is None
    assert sentence_level is None
async def test_train_persist_load_with_composite_entities(
    classifier_params, component_builder, tmpdir
):
    """Train DIET on composite-entity data, persist it and reload it.

    The reloaded interpreter must parse a sample sentence to the same
    result as the interpreter returned by training.
    """
    pipeline = as_pipeline(
        "WhitespaceTokenizer", "CountVectorsFeaturizer", "DIETClassifier"
    )
    classifier_entry = pipeline[2]
    assert classifier_entry["name"] == "DIETClassifier"
    classifier_entry.update(classifier_params)

    nlu_config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})

    trainer, trained, persisted_path = await rasa.nlu.train.train(
        nlu_config,
        path=tmpdir.strpath,
        data="data/test/demo-rasa-composite-entities.yml",
        component_builder=component_builder,
    )
    assert trainer.pipeline
    assert trained.pipeline

    reloaded = Interpreter.load(persisted_path, component_builder)
    assert reloaded.pipeline

    utterance = "I am looking for an italian restaurant"
    assert reloaded.parse(utterance) == trained.parse(utterance)
def test_duckling_entity_extractor_and_synonyms(component_builder):
    """Run the duckling extractor followed by the synonym mapper on one message."""
    nlu_config = RasaNLUModelConfig(
        {
            "pipeline": [
                {"name": "DucklingHTTPExtractor"},
                {"name": "EntitySynonymMapper"},
            ]
        }
    )
    nlu_config.set_component_attr(0, dimensions=["number"])

    duckling = component_builder.create_component(
        nlu_config.for_component(0), nlu_config
    )
    synonyms = component_builder.create_component(
        nlu_config.for_component(1), nlu_config
    )

    message = Message("He was 6 feet away")
    duckling.process(message)

    # checks that the synonym processor
    # can handle entities that have int values
    synonyms.process(message)
    assert message is not None
def test_set_attr_on_component():
    """Setting an attribute on one component must leave the others untouched."""
    component_names = ("SpacyNLP", "SpacyTokenizer", "SpacyFeaturizer", "DIETClassifier")
    nlu_config = RasaNLUModelConfig(
        {
            "language": "en",
            "pipeline": [{"name": name} for name in component_names],
        }
    )

    classifier_index = nlu_config.component_names.index("DIETClassifier")
    tokenizer_index = nlu_config.component_names.index("SpacyTokenizer")

    nlu_config.set_component_attr(classifier_index, epochs=10)

    assert nlu_config.for_component(tokenizer_index) == {"name": "SpacyTokenizer"}

    expected_classifier = {"name": "DIETClassifier", "epochs": 10}
    assert nlu_config.for_component(classifier_index) == expected_classifier
def blank_config() -> RasaNLUModelConfig:
    """Return an English NLU model config with an empty pipeline."""
    settings = {"language": "en", "pipeline": []}
    return RasaNLUModelConfig(settings)
def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    """Train the CRF extractor on two examples and check its helper methods.

    Covers the CRF input format, the sentence features, entity extraction
    and the filtering of entities by extractor, including that filtering
    does not mutate the original examples.
    """
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)

    west_example = Message(
        "anywhere in the west",
        {
            "intent": "restaurant_search",
            "entities": [
                {"start": 16, "end": 20, "value": "west", "entity": "location"}
            ],
            "spacy_doc": spacy_nlp("anywhere in the west"),
        },
    )
    restaurant_example = Message(
        "central indian restaurant",
        {
            "intent": "restaurant_search",
            "entities": [
                {
                    "start": 0,
                    "end": 7,
                    "value": "central",
                    "entity": "location",
                    "extractor": "random_extractor",
                },
                {
                    "start": 8,
                    "end": 14,
                    "value": "indian",
                    "entity": "cuisine",
                    "extractor": "CRFEntityExtractor",
                },
            ],
            "spacy_doc": spacy_nlp("central indian restaurant"),
        },
    )
    training_examples = [west_example, restaurant_example]

    # uses BILOU and the default features
    extractor.train(
        TrainingData(training_examples=training_examples), RasaNLUModelConfig()
    )

    sentence = "anywhere in the west"
    crf_input = extractor._from_text_to_crf(
        Message(sentence, {"spacy_doc": spacy_nlp(sentence)})
    )
    assert [word[0] for word in crf_input] == ["anywhere", "in", "the", "west"]

    features = extractor._sentence_to_features(crf_input)
    assert "BOS" in features[0]
    assert "EOS" in features[-1]
    assert features[1]["0:low"] == "in"

    extractor.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))

    filtered = extractor.filter_trainable_entities(training_examples)

    # Expected values are fresh literals on purpose: reusing the dicts from
    # the examples above would make the non-mutation check meaningless.
    assert filtered[0].get("entities") == [
        {"start": 16, "end": 20, "value": "west", "entity": "location"}
    ], "Entity without extractor remains"
    assert filtered[1].get("entities") == [
        {
            "start": 8,
            "end": 14,
            "value": "indian",
            "entity": "cuisine",
            "extractor": "CRFEntityExtractor",
        }
    ], "Only CRFEntityExtractor entity annotation remains"
    assert training_examples[1].get("entities")[0] == {
        "start": 0,
        "end": 7,
        "value": "central",
        "entity": "location",
        "extractor": "random_extractor",
    }, "Original examples are not mutated"
def test_duckling_entity_extractor(component_builder):
    """Check the duckling HTTP extractor against mocked duckling responses.

    Three scenarios are covered, each with its own registered response body:
    several time entities in one message, a message with an explicit
    reference time, and filtering down to the ``number`` dimension.

    HTTPretty is switched off and reset in a ``finally`` block so the socket
    patching and the registered URIs do not leak out of this test.
    """
    parse_url = "http://localhost:8000/parse"

    httpretty.register_uri(
        httpretty.POST,
        parse_url,
        body=(
            '[{"body":"Today","start":0,"value":{"values":[{ '
            '"value":"2018-11-13T00:00:00.000-08:00","grain":"day", '
            '"type":"value"}],"value":"2018-11-13T00:00:00.000-08:00", '
            '"grain":"day","type":"value"},"end":5, '
            '"dim":"time","latent":false},{"body":"the 5th","start":9, '
            '"value":{"values":[{ '
            '"value":"2018-12-05T00:00:00.000-08:00","grain":"day", '
            '"type":"value"}, '
            '{"value":"2019-01-05T00:00:00.000-08:00","grain":"day", '
            '"type":"value"}, '
            '{"value":"2019-02-05T00:00:00.000-08:00","grain":"day", '
            '"type":"value"}], '
            '"value":"2018-12-05T00:00:00.000-08:00","grain":"day", '
            '"type":"value"},"end":16,"dim":"time", '
            '"latent":false},{"body":"5th of May","start":13,"value":{ '
            '"values":[{ '
            '"value":"2019-05-05T00:00:00.000-07:00","grain":"day", '
            '"type":"value"}, '
            '{"value":"2020-05-05T00:00:00.000-07:00","grain":"day", '
            '"type":"value"}, '
            '{"value":"2021-05-05T00:00:00.000-07:00","grain":"day", '
            '"type":"value"}], '
            '"value":"2019-05-05T00:00:00.000-07:00","grain":"day", '
            '"type":"value"},"end":23,"dim":"time", '
            '"latent":false},{"body":"tomorrow","start":37,"value":{ '
            '"values":[{ '
            '"value":"2018-11-14T00:00:00.000-08:00","grain":"day", '
            '"type":"value"}], '
            '"value":"2018-11-14T00:00:00.000-08:00","grain":"day", '
            '"type":"value"},"end":45,"dim":"time", '
            '"latent":false}]'
        ),
    )
    httpretty.enable()
    try:
        _config = RasaNLUModelConfig({"pipeline": [{"name": "DucklingHTTPExtractor"}]})
        _config.set_component_attr(
            0, dimensions=["time"], timezone="UTC", url="http://localhost:8000"
        )
        duckling = component_builder.create_component(
            _config.for_component(0), _config
        )

        # Single space after "May." keeps "tomorrow" at offsets 37-45, which
        # is what the mocked response above reports.
        message = Message("Today is the 5th of May. Let us meet tomorrow.")
        duckling.process(message)
        entities = message.get("entities")
        assert len(entities) == 4

        # Test duckling with a defined date
        httpretty.register_uri(
            httpretty.POST,
            parse_url,
            body=(
                '[{"body":"tomorrow","start":12,"value":{"values":[{ '
                '"value":"2013-10-13T00:00:00.000Z","grain":"day", '
                '"type":"value"}],"value":"2013-10-13T00:00:00.000Z", '
                '"grain":"day","type":"value"},"end":20, '
                '"dim":"time","latent":false}]'
            ),
        )

        # 1381536182 == 2013/10/12 02:03:02
        message = Message("Let us meet tomorrow.", time="1381536182")
        duckling.process(message)
        entities = message.get("entities")
        assert len(entities) == 1
        assert entities[0]["text"] == "tomorrow"
        assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"

        # Test dimension filtering includes only specified dimensions
        _config = RasaNLUModelConfig({"pipeline": [{"name": "DucklingHTTPExtractor"}]})
        _config.set_component_attr(
            0, dimensions=["number"], url="http://localhost:8000"
        )
        duckling_number = component_builder.create_component(
            _config.for_component(0), _config
        )

        httpretty.register_uri(
            httpretty.POST,
            parse_url,
            body=(
                '[{"body":"Yesterday","start":0,"value":{"values":[{ '
                '"value":"2019-02-28T00:00:00.000+01:00","grain":"day", '
                '"type":"value"}],"value":"2019-02-28T00:00:00.000+01:00", '
                '"grain":"day","type":"value"},"end":9,"dim":"time"}, '
                '{"body":"5","start":21,"value":{"value":5,"type":"value"}, '
                '"end":22,"dim":"number"}]'
            ),
        )

        message = Message("Yesterday there were 5 people in a room")
        duckling_number.process(message)
        entities = message.get("entities")
        assert len(entities) == 1
        assert entities[0]["text"] == "5"
        assert entities[0]["value"] == 5
    finally:
        # Undo the global socket patching and drop the registered URIs.
        httpretty.disable()
        httpretty.reset()
def test_run_cv_evaluation_with_response_selector(monkeypatch: MonkeyPatch):
    """Cross-validate a DIET + ResponseSelector pipeline with training mocked.

    ``Trainer.train`` is replaced by a mock returning a prebuilt
    interpreter. Every metric list must have one entry per fold, and the
    evaluation dictionaries must contain the ``errors`` and ``report`` keys.
    """
    load = rasa.shared.nlu.training_data.loading.load_data
    training_data_obj = load("data/examples/rasa/demo-rasa.yml")
    training_data_responses_obj = load("data/examples/rasa/demo-rasa-responses.yml")
    training_data_obj = training_data_obj.merge(training_data_responses_obj)

    nlu_config = RasaNLUModelConfig(
        {
            "language": "en",
            "pipeline": [
                {"name": "WhitespaceTokenizer"},
                {"name": "CountVectorsFeaturizer"},
                {"name": "DIETClassifier", EPOCHS: 2},
                {"name": "ResponseSelector", EPOCHS: 2},
            ],
        }
    )

    # mock training
    trainer = Trainer(nlu_config)
    trainer.pipeline = remove_pretrained_extractors(trainer.pipeline)
    train_mock = Mock(return_value=Interpreter(trainer.pipeline, None))
    monkeypatch.setattr(Trainer, "train", train_mock)

    n_folds = 2
    intent_results, entity_results, response_selection_results = cross_validate(
        training_data_obj,
        n_folds,
        nlu_config,
        successes=False,
        errors=False,
        disable_plotting=True,
        report_as_dict=True,
    )

    splits = ("train", "test")
    metrics = ("Accuracy", "Precision", "F1-score")

    # Intent results first, then response selection, in the same check order.
    for results in (intent_results, response_selection_results):
        for split in splits:
            for metric in metrics:
                assert len(getattr(results, split)[metric]) == n_folds

        assert all(key in results.evaluation for key in ["errors", "report"])
        assert any(
            isinstance(intent_report, dict)
            and intent_report.get("confused_with") is not None
            for intent_report in results.evaluation["report"].values()
        )

    for split in splits:
        for metric in metrics:
            scores = getattr(entity_results, split)["DIETClassifier"][metric]
            assert len(scores) == n_folds

    for extractor_evaluation in entity_results.evaluation.values():
        assert all(key in extractor_evaluation for key in ["errors", "report"])
def get_nlu_model(self) -> RasaNLUModelConfig:
    """Build a ``RasaNLUModelConfig`` from ``self.language`` and ``self.pipeline``."""
    settings = {"language": self.language, "pipeline": self.pipeline}
    return RasaNLUModelConfig(settings)
def test_run_cv_evaluation_with_response_selector():
    """Cross-validate a DIET + ResponseSelector pipeline on merged demo data.

    Every metric list must have one entry per fold, and the evaluation
    dictionaries must contain the ``errors`` and ``report`` keys.
    """
    load = rasa.shared.nlu.training_data.loading.load_data
    training_data_obj = load("data/examples/rasa/demo-rasa.md")
    training_data_responses_obj = load("data/examples/rasa/demo-rasa-responses.md")
    training_data_obj = training_data_obj.merge(training_data_responses_obj)

    nlu_config = RasaNLUModelConfig(
        {
            "language": "en",
            "pipeline": [
                {"name": "WhitespaceTokenizer"},
                {"name": "CountVectorsFeaturizer"},
                {"name": "DIETClassifier", EPOCHS: 2},
                {"name": "ResponseSelector", EPOCHS: 2},
            ],
        }
    )

    n_folds = 2
    intent_results, entity_results, response_selection_results = cross_validate(
        training_data_obj,
        n_folds,
        nlu_config,
        successes=False,
        errors=False,
        disable_plotting=True,
    )

    splits = ("train", "test")
    metrics = ("Accuracy", "Precision", "F1-score")

    # Intent results first, then response selection, in the same check order.
    for results in (intent_results, response_selection_results):
        for split in splits:
            for metric in metrics:
                assert len(getattr(results, split)[metric]) == n_folds

        assert all(key in results.evaluation for key in ["errors", "report"])

    for split in splits:
        for metric in metrics:
            scores = getattr(entity_results, split)["DIETClassifier"][metric]
            assert len(scores) == n_folds

    for extractor_evaluation in entity_results.evaluation.values():
        assert all(key in extractor_evaluation for key in ["errors", "report"])
def test_incremental_train_featurization(tmp_path: Path):
    """Featurize, persist, reload for finetuning and train with more patterns.

    The reloaded featurizer is trained with the initial patterns plus two
    further ones, one of which redefines the existing ``hello`` pattern.
    """
    initial_patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=initial_patterns),
        RasaNLUModelConfig(),
    )

    # Test featurization of message
    expected_first_token = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    expected_sentence = np.array([1, 1, 1, 0, 0, 0, 0, 0])

    sequence, sentence_level = message.get_sparse_features(TEXT, [])
    sequence = sequence.features if sequence else sequence
    sentence_level = sentence_level.features if sentence_level else sentence_level

    assert (6, 8) == sequence.shape
    assert (1, 8) == sentence_level.shape
    assert np.all(sequence.toarray()[0] == expected_first_token)
    assert np.all(sentence_level.toarray()[-1] == expected_sentence)

    persist_value = featurizer.persist("ftr", str(tmp_path))
    loaded_featurizer = RegexFeaturizer.load(
        meta={"number_additional_patterns": 5, "file": persist_value["file"]},
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    extra_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhey+", "name": "hello", "usage": "intent"},
    ]

    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    loaded_featurizer.train(
        TrainingData([message], regex_features=initial_patterns + extra_patterns),
        RasaNLUModelConfig(),
    )

    # Test featurization of message, this time for the extra pattern as well.
    expected_first_token = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    expected_second_to_last_token = np.array([0, 0, 0, 1, 0, 0, 0, 0])
    expected_sentence = np.array([1, 1, 1, 1, 0, 0, 0, 0])

    sequence, sentence_level = message.get_sparse_features(TEXT, [])
    sequence = sequence.features if sequence else sequence
    sentence_level = sentence_level.features if sentence_level else sentence_level

    assert (6, 8) == sequence.shape
    assert (1, 8) == sentence_level.shape
    assert np.all(sequence.toarray()[0] == expected_first_token)
    assert np.all(sequence.toarray()[-2] == expected_second_to_last_token)
    assert np.all(sentence_level.toarray()[-1] == expected_sentence)

    # we also modified a pattern, check if that is correctly modified
    hello_patterns = [
        pattern
        for pattern in loaded_featurizer.known_patterns
        if pattern["name"] == "hello"
    ]
    assert hello_patterns == [extra_patterns[1]]
def test_persist_load_for_finetuning(tmp_path: Path):
    """Persist a trained RegexFeaturizer, reload it for finetuning, add a lookup.

    Checks the persisted artifacts, the state of the reloaded component and
    the pattern count and vocabulary statistics after training it on a new
    lookup table.
    """
    regex_patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=regex_patterns),
        RasaNLUModelConfig(),
    )

    persist_value = featurizer.persist("ftr", str(tmp_path))

    # Test all artifacts stored as part of persist
    assert persist_value["file"] == "ftr"
    for artifact_name in ("ftr.patterns.pkl", "ftr.vocabulary_stats.pkl"):
        assert (tmp_path / artifact_name).exists()
    assert featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 3,
    }

    loaded_featurizer = RegexFeaturizer.load(
        meta={"number_additional_patterns": 5, "file": persist_value["file"]},
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode
    assert loaded_featurizer.pattern_vocabulary_stats == featurizer.vocabulary_stats

    lookup_data = TrainingData()
    lookup_data.lookup_tables = [
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}
    ]
    loaded_featurizer.train(lookup_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
    assert loaded_featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 4,
    }
def test_regex_featurizer_train():
    """Train the RegexFeaturizer on one message and check its sparse features.

    TEXT and RESPONSE must yield the same sequence and sentence features;
    INTENT must yield none.
    """
    regex_patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 0}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=regex_patterns),
        RasaNLUModelConfig(),
    )

    expected_first_token = np.array([0, 1, 0])
    expected_sentence = np.array([1, 1, 1])

    for attribute in (TEXT, RESPONSE):
        sequence, sentence_level = message.get_sparse_features(attribute, [])
        sequence = sequence.features if sequence else sequence
        sentence_level = sentence_level.features if sentence_level else sentence_level

        assert (6, 3) == sequence.shape
        assert (1, 3) == sentence_level.shape
        assert np.all(sequence.toarray()[0] == expected_first_token)
        assert np.all(sentence_level.toarray()[-1] == expected_sentence)

    sequence, sentence_level = message.get_sparse_features(INTENT, [])
    sequence = sequence.features if sequence else sequence
    sentence_level = sentence_level.features if sentence_level else sentence_level

    assert sequence is None
    assert sentence_level is None
async def test_adjusting_layers_incremental_training(
    component_builder: ComponentBuilder, tmpdir: Path
):
    """Check that `ResponseSelector` sparse layers grow with the sparse features.

    A model is trained on the first data iteration, reloaded and finetuned on
    the second one. After each run the kernel sizes of the sparse-to-dense
    layers are compared with the sparse feature sizes of a featurized
    message; after finetuning the data signatures are verified as well.
    Whether the layers were replaced correctly is checked separately in
    `test_replace_dense_for_sparse_layers` in `test_rasa_layers.py`.
    """
    iter1_data_path = "data/test_incremental_training/iter1/"
    iter2_data_path = "data/test_incremental_training/"
    sample_text = "Rasa is great!"

    def _sparse_feature_sizes(interpreter):
        # Featurize a fresh message and report its sparse sizes for TEXT.
        probe = Message.build(text=sample_text)
        interpreter.featurize_message(probe)
        return probe.get_sparse_feature_sizes(attribute=TEXT)

    def _kernel_rows(interpreter, sparse_dense_key):
        # First kernel dimension of the sparse-to-dense layer stored under
        # `sparse_dense_key` in the last pipeline component's text layers.
        text_layers = interpreter.pipeline[-1].model._tf_layers[
            "sequence_layer.text"
        ]
        combining = text_layers._tf_layers["feature_combining"]
        to_dense = combining._tf_layers[sparse_dense_key]._tf_layers[
            "sparse_to_dense"
        ]
        return to_dense.get_kernel().shape[0]

    pipeline = [
        {"name": "WhitespaceTokenizer"},
        {"name": "LexicalSyntacticFeaturizer"},
        {"name": "RegexFeaturizer"},
        {"name": "CountVectorsFeaturizer"},
        {
            "name": "CountVectorsFeaturizer",
            "analyzer": "char_wb",
            "min_ngram": 1,
            "max_ngram": 4,
        },
        {"name": "ResponseSelector", EPOCHS: 1},
    ]
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})

    # --- first training run -------------------------------------------------
    _, trained, persisted_path = await rasa.nlu.train.train(
        _config,
        path=str(tmpdir),
        data=iter1_data_path,
        component_builder=component_builder,
    )
    assert trained.pipeline

    selector_model = trained.pipeline[-1].model
    old_data_signature = selector_model.data_signature
    old_predict_data_signature = selector_model.predict_data_signature

    old_sparse_feature_sizes = _sparse_feature_sizes(trained)
    assert _kernel_rows(trained, "sparse_dense.sequence") == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE]
    )
    assert _kernel_rows(trained, "sparse_dense.sentence") == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SENTENCE]
    )

    # --- reload and finetune ------------------------------------------------
    loaded = Interpreter.load(
        persisted_path, component_builder, new_config=_config
    )
    assert loaded.pipeline
    assert loaded.parse(sample_text) == trained.parse(sample_text)

    _, trained, _ = await rasa.nlu.train.train(
        _config,
        path=str(tmpdir),
        data=iter2_data_path,
        component_builder=component_builder,
        model_to_finetune=loaded,
    )
    assert trained.pipeline

    new_sparse_feature_sizes = _sparse_feature_sizes(trained)
    assert _kernel_rows(trained, "sparse_dense.sequence") == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE]
    )
    assert _kernel_rows(trained, "sparse_dense.sentence") == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SENTENCE]
    )

    # --- data signatures must have been updated -----------------------------
    finetuned_model = trained.pipeline[-1].model
    new_data_signature = finetuned_model.data_signature
    new_predict_data_signature = finetuned_model.predict_data_signature

    iter2_data = load_data(iter2_data_path)
    expected_sequence_lengths = sum(
        1
        for example in iter2_data.training_examples
        if example.get(INTENT_RESPONSE_KEY)
    )

    def _assert_signatures(
        new_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
        old_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
    ):
        # Signatures that are not expected to change are compared with the
        # old signature directly; for the rest the expected number of units
        # is derived from the new sparse feature sizes.
        changing_attributes = [TEXT]
        changing_feature_types = [FEATURE_TYPE_SEQUENCE, FEATURE_TYPE_SENTENCE]

        for attribute, signatures in new_signature.items():
            for feature_type, feature_signatures in signatures.items():
                if feature_type == "sequence_lengths":
                    assert (
                        feature_signatures[0].units == expected_sequence_lengths
                    )
                    continue

                if feature_type not in changing_feature_types:
                    assert feature_signatures == old_signature.get(
                        attribute
                    ).get(feature_type)
                    continue

                for index, feature_signature in enumerate(feature_signatures):
                    if (
                        feature_signature.is_sparse
                        and attribute in changing_attributes
                    ):
                        expected_units = sum(
                            new_sparse_feature_sizes.get(feature_type)
                        )
                    else:
                        # Dense signatures, and attributes that are not
                        # expected to change, keep their previous units.
                        expected_units = (
                            old_signature.get(attribute)
                            .get(feature_type)[index]
                            .units
                        )
                    assert feature_signature.units == expected_units

    _assert_signatures(new_data_signature, old_data_signature)
    _assert_signatures(new_predict_data_signature, old_predict_data_signature)
def train(self, training_data: TrainingData, config: RasaNLUModelConfig,
          **kwargs: Any) -> None:
    """Train a seq2label model and record where the saved model was written.

    The model is configured from ``self.component_config`` layered on top of
    the model's own default configuration. If the component configuration has
    no ``'result_dir'`` entry, a fresh temporary directory is created and
    stored in it (the component configuration is modified in place).

    Args:
        training_data: Rasa training data. Not read by this method; the
            training input is taken from ``kwargs`` instead.
        config: Rasa NLU model configuration. Not read by this method.
        **kwargs: Must contain ``addons_tf_input_meta``, a mapping whose
            ``'label'`` entry holds the tag set. ``addons_tf_input_fn`` is
            forwarded to ``build_input_func`` as the training data generator.

    Raises:
        TypeError: If ``addons_tf_input_meta`` is missing from ``kwargs``.
    """
    import logging

    from seq2label.input import build_input_func
    from seq2label.model import Model

    # Validate the required inputs first, so that a bad call fails before a
    # temporary directory is created or a model is instantiated.
    corpus_meta_data = kwargs.get('addons_tf_input_meta')
    if corpus_meta_data is None:
        raise TypeError(
            "train() requires the 'addons_tf_input_meta' keyword argument "
            "providing the corpus meta data (with a 'label' entry)."
        )
    train_data_generator_func = kwargs.get('addons_tf_input_fn')

    raw_config = self.component_config
    logging.getLogger(__name__).debug(
        "seq2label component config: %s", raw_config
    )

    if 'result_dir' not in raw_config:
        raw_config['result_dir'] = tempfile.mkdtemp()

    model = Model(raw_config)

    # Component settings override the model defaults. A separate name is used
    # so that the `config` parameter is not shadowed.
    model_config = model.get_default_config()
    model_config.update(raw_config)
    model_config['tags_data'] = corpus_meta_data['label']
    model_config['num_classes'] = len(model_config['tags_data'])

    train_input_func = build_input_func(train_data_generator_func, model_config)

    # NOTE(review): the second argument is presumably the evaluation input
    # function, which is not supplied here - confirm against seq2label.
    _, _, final_saved_model = model.train_and_eval_then_save(
        train_input_func, None, model_config
    )

    self.result_dir = final_saved_model
async def test_sparse_feature_sizes_decreased_incremental_training(
    iter1_path: Text,
    iter2_path: Text,
    should_raise_exception: bool,
    component_builder: ComponentBuilder,
    tmpdir: Path,
):
    """Finetune a `ResponseSelector` pipeline on a second data set.

    A model is trained on `iter1_path`, reloaded and finetuned on
    `iter2_path`. When `should_raise_exception` is set, finetuning must fail
    with an error mentioning decreased sparse feature sizes; otherwise it
    must yield a model with a non-empty pipeline.
    """
    pipeline = [
        {"name": "WhitespaceTokenizer"},
        {"name": "LexicalSyntacticFeaturizer"},
        {"name": "RegexFeaturizer"},
        {"name": "CountVectorsFeaturizer"},
        {
            "name": "CountVectorsFeaturizer",
            "analyzer": "char_wb",
            "min_ngram": 1,
            "max_ngram": 4,
        },
        {"name": "ResponseSelector", EPOCHS: 1},
    ]
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})

    async def _train_on(data_path, **extra):
        # Both training runs share everything except the data path and the
        # optional model to finetune.
        return await rasa.nlu.train.train(
            _config,
            path=str(tmpdir),
            data=data_path,
            component_builder=component_builder,
            **extra,
        )

    _, trained, persisted_path = await _train_on(iter1_path)
    assert trained.pipeline

    loaded = Interpreter.load(
        persisted_path, component_builder, new_config=_config
    )
    assert loaded.pipeline
    assert loaded.parse("Rasa is great!") == trained.parse("Rasa is great!")

    if not should_raise_exception:
        _, trained, _ = await _train_on(iter2_path, model_to_finetune=loaded)
        assert trained.pipeline
        return

    with pytest.raises(Exception) as exec_info:
        _, trained, _ = await _train_on(iter2_path, model_to_finetune=loaded)
    # Checked after the `with` block: statements following the raising call
    # inside the block would never run.
    assert "Sparse feature sizes have decreased" in str(exec_info.value)