def test_data_merging(files):
    td_reference = load_data(files[0])
    td = load_data(files[1])
    assert len(td.entity_examples) == len(td_reference.entity_examples)
    assert len(td.intent_examples) == len(td_reference.intent_examples)
    assert len(td.training_examples) == len(td_reference.training_examples)
    assert td.intents == td_reference.intents
    assert td.entities == td_reference.entities
    assert td.entity_synonyms == td_reference.entity_synonyms
    assert td.regex_features == td_reference.regex_features
def test_get_nlu_file(project):
    data_file = os.path.join(project, "data/nlu.yml")
    nlu_directory = rasa.shared.data.get_nlu_directory(data_file)
    nlu_files = os.listdir(nlu_directory)

    original = load_data(data_file)
    copied = load_data(nlu_directory)

    assert nlu_files[0].endswith("nlu.yml")
    assert original.intent_examples == copied.intent_examples
def test_markdown_single_sections():
    td_regex_only = load_data("data/test/markdown_single_sections/regex_only.md")
    assert td_regex_only.regex_features == [{"name": "greet", "pattern": r"hey[^\s]*"}]

    td_syn_only = load_data("data/test/markdown_single_sections/synonyms_only.md")
    assert td_syn_only.entity_synonyms == {"Chines": "chinese", "Chinese": "chinese"}
def test_predict():
    model_path = train_nlu(
        nlu_data=NLU_DATA_PATH,
        config="tests/configs/sparse-naive-bayes-intent-classifier-config.yml",
        output="models",
    )
    interpreter = load_interpreter(model_path)

    # Get features from the pipeline and prepare data in the format sklearn
    # expects.
    training_data = load_data(NLU_DATA_PATH)
    for example in training_data.intent_examples:
        interpreter.featurize_message(example)
    model = interpreter.interpreter.pipeline[-1]
    X, y = model.prepare_data(training_data)

    # Fit the equivalent sklearn classifier.
    from sklearn.naive_bayes import BernoulliNB

    clf = BernoulliNB(alpha=0.1, binarize=0.0, fit_prior=True)
    clf.fit(X, y)

    # Check that predictions agree.
    assert (clf.predict_proba(X) == model.predict_prob(X)).all()
    assert (clf.predict(X) == model.predict(X)[0][:, 0]).all()
def test_repeated_entities(tmp_path):
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "book a table today from 3 to 6 for 3 people",
        "intent": "unk",
        "entities": [
          {
            "entity": "description",
            "start": 35,
            "end": 36,
            "value": "3"
          }
        ]
      }
    ]
  }
}"""
    f = tmp_path / "tmp_training_data.json"
    f.write_text(data, rasa.shared.utils.io.DEFAULT_ENCODING)

    td = load_data(str(f))
    assert len(td.entity_examples) == 1
    example = td.entity_examples[0]
    entities = example.get("entities")
    assert len(entities) == 1
    tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT)
    start, end = MitieEntityExtractor.find_entity(
        entities[0], example.get(TEXT), tokens
    )
    assert start == 9
    assert end == 10
def test_multiword_entities(tmp_path):
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "show me flights to New York City",
        "intent": "unk",
        "entities": [
          {
            "entity": "destination",
            "start": 19,
            "end": 32,
            "value": "New York City"
          }
        ]
      }
    ]
  }
}"""
    f = tmp_path / "tmp_training_data.json"
    f.write_text(data, rasa.shared.utils.io.DEFAULT_ENCODING)

    td = load_data(str(f))
    assert len(td.entity_examples) == 1
    example = td.entity_examples[0]
    entities = example.get("entities")
    assert len(entities) == 1
    tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT)
    start, end = MitieEntityExtractor.find_entity(
        entities[0], example.get(TEXT), tokens
    )
    assert start == 4
    assert end == 7
def test_nonascii_entities(tmp_path):
    data = """
{
  "luis_schema_version": "5.0",
  "utterances" : [
    {
      "text": "I am looking for a ßäæ ?€ö) item",
      "intent": "unk",
      "entities": [
        {
          "entity": "description",
          "startPos": 19,
          "endPos": 26
        }
      ]
    }
  ]
}"""
    f = tmp_path / "tmp_training_data.json"
    f.write_text(data, rasa.shared.utils.io.DEFAULT_ENCODING)

    td = load_data(str(f))
    assert len(td.entity_examples) == 1
    example = td.entity_examples[0]
    entities = example.get("entities")
    assert len(entities) == 1
    entity = entities[0]
    assert entity["value"] == "ßäæ ?€ö)"
    assert entity["start"] == 19
    assert entity["end"] == 27
    assert entity["entity"] == "description"
def test_dialogflow_data():
    td = load_data("data/examples/dialogflow/")
    assert not td.is_empty()
    assert len(td.entity_examples) == 5
    assert len(td.intent_examples) == 24
    assert len(td.training_examples) == 24
    assert len(td.lookup_tables) == 2
    assert td.intents == {"affirm", "goodbye", "hi", "inform"}
    assert td.entities == {"cuisine", "location"}

    non_trivial_synonyms = {k: v for k, v in td.entity_synonyms.items() if k != v}
    assert non_trivial_synonyms == {
        "mexico": "mexican",
        "china": "chinese",
        "india": "indian",
    }

    # The order changes based on different computers, hence the grouping
    assert {td.lookup_tables[0]["name"], td.lookup_tables[1]["name"]} == {
        "location",
        "cuisine",
    }
    assert {
        len(td.lookup_tables[0]["elements"]),
        len(td.lookup_tables[1]["elements"]),
    } == {4, 6}
def test_lookup_table_md():
    lookup_fname = "data/test/lookup_tables/plates.txt"
    td_lookup = load_data("data/test/lookup_tables/lookup_table.md")
    assert not td_lookup.is_empty()
    assert len(td_lookup.lookup_tables) == 1
    assert td_lookup.lookup_tables[0]["name"] == "plates"
    assert td_lookup.lookup_tables[0]["elements"] == lookup_fname
def training_data_from_paths(paths: Iterable[Text], language: Text) -> TrainingData:
    """Loads training data from each path and merges it into one `TrainingData`."""
    from rasa.shared.nlu.training_data import loading

    training_data_sets = [loading.load_data(nlu_file, language) for nlu_file in paths]
    return TrainingData().merge(*training_data_sets)
def test_markdown_empty_section():
    data = load_data("data/test/markdown_single_sections/empty_section.md")

    assert data.regex_features == [{"name": "greet", "pattern": r"hey[^\s]*"}]

    assert not data.entity_synonyms
    assert len(data.lookup_tables) == 1
    assert data.lookup_tables[0]["name"] == "chinese"
    assert "Chinese" in data.lookup_tables[0]["elements"]
    assert "Chines" in data.lookup_tables[0]["elements"]
def test_wit_data():
    td = load_data("data/examples/wit/demo-flights.json")
    assert not td.is_empty()
    assert len(td.entity_examples) == 4
    assert len(td.intent_examples) == 1
    assert len(td.training_examples) == 4
    assert td.entity_synonyms == {}
    assert td.intents == {"flight_booking"}
    assert td.entities == {"location", "datetime"}
def getUtterance(intent_):
    # `dataFile` is expected to be defined at module level (path to the NLU
    # training data file).
    train_data = load_data(dataFile)
    training_examples = OrderedDict()
    INTENT = 'intent'
    for example in [e.as_dict_nlu() for e in train_data.training_examples]:
        intent = example[INTENT]
        training_examples.setdefault(intent, [])
        training_examples[intent].append(example)
    return training_examples[intent_][0]['text']
def test_luis_data():
    td = load_data("data/examples/luis/demo-restaurants_v5.json")

    assert not td.is_empty()
    assert len(td.entity_examples) == 8
    assert len(td.intent_examples) == 28
    assert len(td.training_examples) == 28
    assert td.entity_synonyms == {}
    assert td.intents == {"affirm", "goodbye", "greet", "inform"}
    assert td.entities == {"location", "cuisine"}
def test_training_data_conversion(
    tmpdir, data_file, gold_standard_file, output_format, language
):
    out_path = tmpdir.join("rasa_nlu_data.json")
    convert_training_data(data_file, out_path.strpath, output_format, language)
    td = load_data(out_path.strpath, language)
    assert td.entity_examples != []
    assert td.intent_examples != []

    gold_standard = load_data(gold_standard_file, language)
    cmp_message_list(td.entity_examples, gold_standard.entity_examples)
    cmp_message_list(td.intent_examples, gold_standard.intent_examples)
    assert td.entity_synonyms == gold_standard.entity_synonyms

    # converting the converted file back to the original
    # file format and performing the same tests
    rto_path = tmpdir.join("data_in_original_format.txt")
    convert_training_data(out_path.strpath, rto_path.strpath, "json", language)
    rto = load_data(rto_path.strpath, language)
    cmp_message_list(gold_standard.entity_examples, rto.entity_examples)
    cmp_message_list(gold_standard.intent_examples, rto.intent_examples)
    assert gold_standard.entity_synonyms == rto.entity_synonyms
async def train(
    nlu_config: Union[Text, Dict, RasaNLUModelConfig],
    data: Union[Text, "TrainingDataImporter"],
    path: Optional[Text] = None,
    fixed_model_name: Optional[Text] = None,
    storage: Optional[Text] = None,
    component_builder: Optional[ComponentBuilder] = None,
    training_data_endpoint: Optional[EndpointConfig] = None,
    persist_nlu_training_data: bool = False,
    model_to_finetune: Optional[Interpreter] = None,
    **kwargs: Any,
) -> Tuple[Trainer, Interpreter, Optional[Text]]:
    """Loads the trainer and the data and runs the training of the model."""
    from rasa.shared.importers.importer import TrainingDataImporter

    if not isinstance(nlu_config, RasaNLUModelConfig):
        nlu_config = config.load(nlu_config)

    # Ensure we are training a model that we can save in the end
    # WARN: there is still a race condition if a model with the same name is
    # trained in another subprocess
    trainer = Trainer(
        nlu_config, component_builder, model_to_finetune=model_to_finetune
    )
    persistor = create_persistor(storage)

    if training_data_endpoint is not None:
        training_data = await load_data_from_endpoint(
            training_data_endpoint, nlu_config.language
        )
    elif isinstance(data, TrainingDataImporter):
        training_data = await data.get_nlu_data(nlu_config.language)
    else:
        training_data = load_data(data, nlu_config.language)

    training_data.print_stats()
    if training_data.entity_roles_groups_used():
        rasa.shared.utils.common.mark_as_experimental_feature(
            "Entity Roles and Groups feature"
        )

    interpreter = trainer.train(training_data, **kwargs)

    if path:
        persisted_path = trainer.persist(
            path, persistor, fixed_model_name, persist_nlu_training_data
        )
    else:
        persisted_path = None

    return trainer, interpreter, persisted_path
def test_spacy_intent_featurizer(
    spacy_nlp_component: SpacyNLP, spacy_model: SpacyModel
):
    td = loading.load_data("data/examples/rasa/demo-rasa.json")
    spacy_nlp_component.process_training_data(td, spacy_model)
    spacy_featurizer = create_spacy_featurizer({})
    spacy_featurizer.process_training_data(td)

    intent_features_exist = np.array(
        [
            True if example.get("intent_features") is not None else False
            for example in td.intent_examples
        ]
    )

    # no intent features should have been set
    assert not any(intent_features_exist)
def test_composite_entities_data():
    td = load_data("data/test/demo-rasa-composite-entities.yml")
    assert not td.is_empty()
    assert len(td.entity_examples) == 11
    assert len(td.intent_examples) == 29
    assert len(td.training_examples) == 29
    assert td.entity_synonyms == {"SF": "San Fransisco"}
    assert td.intents == {"order_pizza", "book_flight", "chitchat", "affirm"}
    assert td.entities == {"location", "topping", "size"}
    assert td.entity_groups == {"1", "2"}
    assert td.entity_roles == {"to", "from"}
    assert td.number_of_examples_per_entity["entity 'location'"] == 8
    assert td.number_of_examples_per_entity["group '1'"] == 9
    assert td.number_of_examples_per_entity["role 'from'"] == 3
async def _convert_nlu_training_data(
    in_path: Text,
    out_path: Text,
    language: Text,
):
    if rasa.shared.data.is_likely_yaml_file(out_path):
        from rasa.shared.nlu.training_data.loading import load_data
        from rasa.shared.nlu.training_data.formats.rasa_yaml import RasaYAMLWriter

        training_data = load_data(in_path, language)
        RasaYAMLWriter().dump(out_path, training_data)
    else:
        from rasa.nlu.convert import convert_training_data

        convert_training_data(
            in_path,
            out_path,
            Path(out_path).suffix.replace('.', ''),
            language,
        )
async def load_data_from_endpoint(
    data_endpoint: EndpointConfig, language: Optional[Text] = "en"
) -> "TrainingData":
    """Load training data from a URL."""
    import requests

    if not utils.is_url(data_endpoint.url):
        raise requests.exceptions.InvalidURL(data_endpoint.url)
    try:
        response = await data_endpoint.request("get")
        response.raise_for_status()
        temp_data_file = io_utils.create_temporary_file(response.content, mode="w+b")
        training_data = load_data(temp_data_file, language)

        return training_data
    except Exception as e:
        logger.warning(f"Could not retrieve training data from URL:\n{e}")
def test_spacy_intent_featurizer(spacy_nlp_component):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    td = loading.load_data("data/examples/rasa/demo-rasa.json")
    spacy_nlp_component.train(td, config=None)
    spacy_featurizer = SpacyFeaturizer()
    spacy_featurizer.train(td, config=None)

    intent_features_exist = np.array(
        [
            True if example.get("intent_features") is not None else False
            for example in td.intent_examples
        ]
    )

    # no intent features should have been set
    assert not any(intent_features_exist)
def split_nlu_data(args: argparse.Namespace) -> None:
    """Load data from a file path and split the NLU data into test and train examples.

    Args:
        args: Commandline arguments
    """
    from rasa.shared.nlu.training_data.loading import load_data
    from rasa.shared.nlu.training_data.util import get_file_format

    data_path = rasa.cli.utils.get_validated_path(args.nlu, "nlu", DEFAULT_DATA_PATH)
    data_path = rasa.shared.data.get_nlu_directory(data_path)

    nlu_data = load_data(data_path)
    fformat = get_file_format(data_path)

    train, test = nlu_data.train_test_split(args.training_fraction, args.random_seed)

    train.persist(args.out, filename=f"training_data.{fformat}")
    test.persist(args.out, filename=f"test_data.{fformat}")
def test_custom_attributes(tmp_path):
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "intent": "happy",
        "text": "I'm happy.",
        "sentiment": 0.8
      }
    ]
  }
}"""
    f = tmp_path / "tmp_training_data.json"
    f.write_text(data, rasa.shared.utils.io.DEFAULT_ENCODING)

    td = load_data(str(f))
    assert len(td.training_examples) == 1
    example = td.training_examples[0]
    assert example.get("sentiment") == 0.8
def test_entities_synonyms(tmp_path):
    data = """
{
  "rasa_nlu_data": {
    "entity_synonyms": [
      {
        "value": "nyc",
        "synonyms": ["New York City", "nyc", "the big apple"]
      }
    ],
    "common_examples" : [
      {
        "text": "show me flights to New York City",
        "intent": "unk",
        "entities": [
          {
            "entity": "destination",
            "start": 19,
            "end": 32,
            "value": "NYC"
          }
        ]
      },
      {
        "text": "show me flights to nyc",
        "intent": "unk",
        "entities": [
          {
            "entity": "destination",
            "start": 19,
            "end": 22,
            "value": "nyc"
          }
        ]
      }
    ]
  }
}"""
    f = tmp_path / "tmp_training_data.json"
    f.write_text(data, rasa.shared.utils.io.DEFAULT_ENCODING)

    td = load_data(str(f))
    assert td.entity_synonyms["New York City"] == "nyc"
def test_spacy_featurizer_casing(spacy_nlp):
    # if this starts failing for the default model, we should think about
    # removing the lower casing the spacy nlp component does when it
    # retrieves vectors. For compressed spacy models (e.g. models
    # ending in _sm) this test will most likely fail.

    ftr = create_spacy_featurizer({})

    td = loading.load_data("data/examples/rasa/demo-rasa.json")
    for e in td.intent_examples:
        doc = spacy_nlp(e.get(TEXT))
        doc_capitalized = spacy_nlp(e.get(TEXT).capitalize())

        vecs = ftr._features_for_doc(doc)
        vecs_capitalized = ftr._features_for_doc(doc_capitalized)

        assert np.allclose(
            vecs, vecs_capitalized, atol=1e-5
        ), "Vectors are unequal for texts '{}' and '{}'".format(
            e.text, e.text.capitalize()
        )
async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
    fake_data_count = self.DEFAULT_FAKE_DATA_COUNT
    for importer in self.config["importers"]:
        if importer.get("name") == "rasam.PlaceholderImporter":
            fake_data_count = importer.get(
                "fake_data_count", self.DEFAULT_FAKE_DATA_COUNT
            )

    faker_ = faker.Faker()
    faker_.seed_instance(fake_data_count)

    training_data = [
        loading.load_data(nlu_file, language) for nlu_file in self._nlu_files
    ]

    new_training_data = []
    for data in training_data:
        training_examples = []
        example: Message
        for example in data.training_examples:
            if example.get("intent"):
                matches = [
                    i async for i in self.find_placeholders(example.data.get("text"))
                ]
                if matches:
                    async for new_message in self.replace_placeholders(
                        example, faker_, matches, fake_data_count
                    ):
                        training_examples.append(new_message)
                else:
                    training_examples.append(example)
            else:
                training_examples.append(example)
        new_training_data.append(
            TrainingData(
                training_examples,
                data.entity_synonyms,
                data.regex_features,
                data.lookup_tables,
                data.responses,
            )
        )

    merged_training_data = TrainingData().merge(*new_training_data)
    merged_training_data._fill_response_phrases()
    return merged_training_data
from rasa.shared.nlu.training_data.loading import load_data
from rasa.nlu.model import Trainer
from rasa.nlu import config

data_path = 'data/nlu_textual_queries.md'
# data_path = 'data/nlu.md'

training_data = load_data(data_path)
trainer = Trainer(config.load())
trainer.train(training_data)

directory = '../data/models/rasa/nlu'
trainer.persist(directory)
async def test_adjusting_layers_incremental_training(
    create_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    load_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    train_and_preprocess: Callable[..., Tuple[TrainingData, List[GraphComponent]]],
    process_message: Callable[..., Message],
):
    """Tests adjusting sparse layers of `ResponseSelector` to increased sparse
    feature sizes during incremental training.

    Testing is done by checking the layer sizes.
    Checking if they were replaced correctly is also important and is done in
    `test_replace_dense_for_sparse_layers` in `test_rasa_layers.py`.
    """
    iter1_data_path = "data/test_incremental_training/iter1/"
    iter2_data_path = "data/test_incremental_training/"
    pipeline = [
        {"component": WhitespaceTokenizer},
        {"component": LexicalSyntacticFeaturizer},
        {"component": RegexFeaturizer},
        {"component": CountVectorsFeaturizer},
        {
            "component": CountVectorsFeaturizer,
            "analyzer": "char_wb",
            "min_ngram": 1,
            "max_ngram": 4,
        },
    ]
    training_data, loaded_pipeline = train_and_preprocess(pipeline, iter1_data_path)

    response_selector = create_response_selector({EPOCHS: 1})
    response_selector.train(training_data=training_data)

    old_data_signature = response_selector.model.data_signature
    old_predict_data_signature = response_selector.model.predict_data_signature

    message = Message(data={TEXT: "Rasa is great!"})
    message = process_message(loaded_pipeline, message)
    message2 = copy.deepcopy(message)

    classified_message = response_selector.process([message])[0]
    old_sparse_feature_sizes = classified_message.get_sparse_feature_sizes(
        attribute=TEXT
    )

    initial_rs_layers = response_selector.model._tf_layers[
        "sequence_layer.text"
    ]._tf_layers["feature_combining"]
    initial_rs_sequence_layer = initial_rs_layers._tf_layers[
        "sparse_dense.sequence"
    ]._tf_layers["sparse_to_dense"]
    initial_rs_sentence_layer = initial_rs_layers._tf_layers[
        "sparse_dense.sentence"
    ]._tf_layers["sparse_to_dense"]

    initial_rs_sequence_size = initial_rs_sequence_layer.get_kernel().shape[0]
    initial_rs_sentence_size = initial_rs_sentence_layer.get_kernel().shape[0]
    assert initial_rs_sequence_size == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE]
    )
    assert initial_rs_sentence_size == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SENTENCE]
    )

    loaded_selector = load_response_selector({EPOCHS: 1})
    classified_message2 = loaded_selector.process([message2])[0]
    assert classified_message2.fingerprint() == classified_message.fingerprint()

    training_data2, loaded_pipeline2 = train_and_preprocess(pipeline, iter2_data_path)
    response_selector.train(training_data=training_data2)

    new_message = Message.build(text="Rasa is great!")
    new_message = process_message(loaded_pipeline2, new_message)

    classified_new_message = response_selector.process([new_message])[0]
    new_sparse_feature_sizes = classified_new_message.get_sparse_feature_sizes(
        attribute=TEXT
    )

    final_rs_layers = response_selector.model._tf_layers[
        "sequence_layer.text"
    ]._tf_layers["feature_combining"]
    final_rs_sequence_layer = final_rs_layers._tf_layers[
        "sparse_dense.sequence"
    ]._tf_layers["sparse_to_dense"]
    final_rs_sentence_layer = final_rs_layers._tf_layers[
        "sparse_dense.sentence"
    ]._tf_layers["sparse_to_dense"]

    final_rs_sequence_size = final_rs_sequence_layer.get_kernel().shape[0]
    final_rs_sentence_size = final_rs_sentence_layer.get_kernel().shape[0]
    assert final_rs_sequence_size == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE]
    )
    assert final_rs_sentence_size == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SENTENCE]
    )

    # check if the data signatures were correctly updated
    new_data_signature = response_selector.model.data_signature
    new_predict_data_signature = response_selector.model.predict_data_signature
    iter2_data = load_data(iter2_data_path)
    expected_sequence_lengths = len(
        [
            message
            for message in iter2_data.training_examples
            if message.get(INTENT_RESPONSE_KEY)
        ]
    )

    def test_data_signatures(
        new_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
        old_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
    ):
        # Wherever attribute / feature_type signature is not
        # expected to change, directly compare it to old data signature.
        # Else compute its expected signature and compare
        attributes_expected_to_change = [TEXT]
        feature_types_expected_to_change = [
            FEATURE_TYPE_SEQUENCE,
            FEATURE_TYPE_SENTENCE,
        ]

        for attribute, signatures in new_signature.items():
            for feature_type, feature_signatures in signatures.items():
                if feature_type == "sequence_lengths":
                    assert feature_signatures[0].units == expected_sequence_lengths
                elif feature_type not in feature_types_expected_to_change:
                    assert feature_signatures == old_signature.get(attribute).get(
                        feature_type
                    )
                else:
                    for index, feature_signature in enumerate(feature_signatures):
                        if (
                            feature_signature.is_sparse
                            and attribute in attributes_expected_to_change
                        ):
                            assert feature_signature.units == sum(
                                new_sparse_feature_sizes.get(feature_type)
                            )
                        else:
                            # dense signature or attributes that are not
                            # expected to change can be compared directly
                            assert (
                                feature_signature.units
                                == old_signature.get(attribute)
                                .get(feature_type)[index]
                                .units
                            )

    test_data_signatures(new_data_signature, old_data_signature)
    test_data_signatures(new_predict_data_signature, old_predict_data_signature)
def test_lookup_table_yaml():
    td_lookup = load_data("data/test/lookup_tables/lookup_table.yml")
    assert not td_lookup.is_empty()
    assert len(td_lookup.lookup_tables) == 1
    assert td_lookup.lookup_tables[0]["name"] == "plates"
    assert len(td_lookup.lookup_tables[0]["elements"]) == 5
def test_nlu_training_data_provider(
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    config_path: Text,
    nlu_data_path: Text,
):
    # create a resource and an importer
    resource = Resource("xy")
    importer = TrainingDataImporter.load_from_config(
        config_path=config_path, training_data_paths=[nlu_data_path]
    )

    # check the default configuration is as expected
    config_1 = NLUTrainingDataProvider.get_default_config()
    assert config_1["language"] is None
    assert config_1["persist"] is False

    # create a provider with persist == True
    provider_1 = NLUTrainingDataProvider.create(
        {"language": "en", "persist": True},
        default_model_storage,
        resource,
        default_execution_context,
    )
    assert isinstance(provider_1, NLUTrainingDataProvider)

    # check the data provided is as expected
    data_0 = provider_1.provide(importer)
    data_1 = importer.get_nlu_data(language="en")
    assert data_0.fingerprint() == data_1.fingerprint()

    # check the data was persisted
    with default_model_storage.read_from(resource) as resource_directory:
        data_file = os.path.join(
            str(resource_directory), DEFAULT_TRAINING_DATA_OUTPUT_PATH
        )
        data = load_data(resource_name=data_file, language="en")
        assert os.path.isfile(data_file)
        assert isinstance(data, TrainingData)

        # delete the persisted data
        os.remove(data_file)
        assert not os.path.isfile(data_file)

    # create a provider with persist == False
    provider_2 = NLUTrainingDataProvider.create(
        {"language": "en", "persist": False},
        default_model_storage,
        resource,
        default_execution_context,
    )
    provider_2.provide(importer)

    # check the data was not persisted
    with default_model_storage.read_from(resource) as resource_directory:
        data_file = os.path.join(
            str(resource_directory), DEFAULT_TRAINING_DATA_OUTPUT_PATH
        )
        assert not os.path.isfile(data_file)