def test_fingerprint_is_same_when_loading_data_again():
    """Loading identical training data twice must yield equal fingerprints."""
    from rasa.shared.importers.utils import training_data_from_paths

    data_files = [
        "data/examples/rasa/demo-rasa.yml",
        "data/examples/rasa/demo-rasa-responses.yml",
    ]
    first = training_data_from_paths(data_files, language="en")
    second = training_data_from_paths(data_files, language="en")

    # Fingerprinting is deterministic for identical inputs.
    assert first.fingerprint() == second.fingerprint()
def test_demo_data(files: List[Text]):
    """Sanity-check the parsed demo training data against its known contents."""
    from rasa.shared.importers.utils import training_data_from_paths

    data = training_data_from_paths(files, language="en")

    # Declared labels.
    assert data.intents == {
        "affirm",
        "greet",
        "restaurant_search",
        "goodbye",
        "chitchat",
    }
    assert data.entities == {"location", "cuisine"}
    assert set(data.responses.keys()) == {
        "utter_chitchat/ask_name",
        "utter_chitchat/ask_weather",
    }

    # Example counts.
    assert len(data.training_examples) == 46
    assert len(data.intent_examples) == 46
    assert len(data.response_examples) == 4
    assert len(data.entity_examples) == 11
    assert len(data.responses) == 2

    # Synonyms and regex features parsed from the files.
    assert data.entity_synonyms == {
        "Chines": "chinese",
        "Chinese": "chinese",
        "chines": "chinese",
        "vegg": "vegetarian",
        "veggie": "vegetarian",
    }
    assert data.regex_features == [
        {"name": "greet", "pattern": r"hey[^\s]*"},
        {"name": "zipcode", "pattern": r"[0-9]{5}"},
    ]
def test_train_test_split(filepaths: List[Text]):
    """Splitting must preserve the example total and keep every intent and
    response class represented in both splits for a range of train fractions.
    """
    from rasa.shared.importers.utils import training_data_from_paths

    training_data = training_data_from_paths(filepaths, language="en")
    assert training_data.intents == {
        "affirm",
        "greet",
        "restaurant_search",
        "goodbye",
        "chitchat",
    }
    assert training_data.entities == {"location", "cuisine"}
    assert set(training_data.responses.keys()) == {
        "utter_chitchat/ask_name",
        "utter_chitchat/ask_weather",
    }

    NUM_TRAIN_EXAMPLES = 46
    NUM_RESPONSE_EXAMPLES = 4
    assert len(training_data.training_examples) == NUM_TRAIN_EXAMPLES
    assert len(training_data.intent_examples) == NUM_TRAIN_EXAMPLES
    assert len(training_data.response_examples) == NUM_RESPONSE_EXAMPLES

    for train_percent in range(50, 95, 5):
        train_frac = train_percent / 100.0
        train_split, test_split = training_data.train_test_split(train_frac)

        # No example may be lost or duplicated by the split.
        assert (
            len(test_split.training_examples)
            + len(train_split.training_examples)
            == NUM_TRAIN_EXAMPLES
        )

        # Retrieval intents are represented by their responses, so subtract
        # them to avoid double counting the number of distinct classes.
        num_classes = (
            len(training_data.number_of_examples_per_intent.keys())
            - len(training_data.retrieval_intents)
            + len(training_data.number_of_examples_per_response)
        )

        expected_num_train_examples_floor = int(train_frac * NUM_TRAIN_EXAMPLES)
        if NUM_TRAIN_EXAMPLES - expected_num_train_examples_floor < num_classes:
            expected_num_train_examples_floor = (
                NUM_TRAIN_EXAMPLES - num_classes - 1
            )

        # The train split may exceed the floor by at most one example.
        assert (
            len(train_split.training_examples)
            >= expected_num_train_examples_floor
        )
        assert (
            len(train_split.training_examples)
            <= expected_num_train_examples_floor + 1
        )

        # Every intent class must appear in both splits.
        assert len(training_data.number_of_examples_per_intent.keys()) == len(
            test_split.number_of_examples_per_intent.keys()
        )
        assert len(training_data.number_of_examples_per_intent.keys()) == len(
            train_split.number_of_examples_per_intent.keys()
        )
        # Every response class must appear in both splits.
        # BUG FIX: the second assertion previously re-checked train_split
        # (a copy-paste duplicate) and never verified test_split.
        assert len(
            training_data.number_of_examples_per_response.keys()
        ) == len(train_split.number_of_examples_per_response.keys())
        assert len(
            training_data.number_of_examples_per_response.keys()
        ) == len(test_split.number_of_examples_per_response.keys())
def test_fingerprint_is_different_when_lookup_table_has_changed(
    monkeypatch: MonkeyPatch,
):
    """Changing a lookup table's contents must change the data fingerprint."""
    from rasa.shared.importers.utils import training_data_from_paths

    files = [
        "data/test/lookup_tables/lookup_table.json",
    ]

    original = training_data_from_paths(files, language="en")
    original_fingerprint = original.fingerprint()

    # Pretend the same file now loads with different table elements.
    monkeypatch.setattr(
        TrainingData,
        "_load_lookup_table",
        Mock(return_value={"name": "plates", "elements": "tacos\nbeef"}),
    )

    modified = training_data_from_paths(files, language="en")
    modified_fingerprint = modified.fingerprint()

    assert original_fingerprint != modified_fingerprint
def test_training_data_fingerprint_incorporates_tokens(
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Tokenizing training data must be reflected in its fingerprint."""
    from rasa.shared.importers.utils import training_data_from_paths

    paths = [
        "data/examples/rasa/demo-rasa.yml",
        "data/examples/rasa/demo-rasa-responses.yml",
    ]
    training_data = training_data_from_paths(paths, language="en")
    untokenized_fingerprint = training_data.fingerprint()

    whitespace_tokenizer.process_training_data(training_data)

    # Tokenization mutates the examples, so the fingerprint must change.
    assert untokenized_fingerprint != training_data.fingerprint()
def test_train_test_split_with_random_seed(filepaths):
    """Equal random seeds must produce identical train/test splits."""
    from rasa.shared.importers.utils import training_data_from_paths

    data = training_data_from_paths(filepaths, language="en")

    train_a, test_a = data.train_test_split(train_frac=0.8, random_seed=1)
    train_b, test_b = data.train_test_split(train_frac=0.8, random_seed=1)

    def intent_texts(split):
        # Compare splits by their example texts rather than object identity.
        return [example.get(TEXT) for example in split.intent_examples]

    assert intent_texts(train_a) == intent_texts(train_b)
    assert intent_texts(test_a) == intent_texts(test_b)
def test_training_data_fingerprint_incorporates_features():
    """Adding features to an example must change the data fingerprint."""
    from rasa.shared.importers.utils import training_data_from_paths

    paths = [
        "data/examples/rasa/demo-rasa.yml",
        "data/examples/rasa/demo-rasa-responses.yml",
    ]
    training_data = training_data_from_paths(paths, language="en")
    baseline_fingerprint = training_data.fingerprint()

    # Attach a dense feature matrix to the first example.
    dense_matrix = np.random.random((128, 128))
    feature = Features(
        dense_matrix, FEATURE_TYPE_SENTENCE, TEXT, "RegexFeaturizer"
    )
    training_data.training_examples[0].add_features(feature)

    # The added features mutate an example, so the fingerprint must differ.
    assert baseline_fingerprint != training_data.fingerprint()
def test_demo_data_filter_out_retrieval_intents(files):
    """Filtering by INTENT_RESPONSE_KEY must partition the examples without
    mutating the source training data."""
    from rasa.shared.importers.utils import training_data_from_paths

    data = training_data_from_paths(files, language="en")
    assert len(data.training_examples) == 46

    without_retrieval = data.filter_training_examples(
        lambda ex: ex.get(INTENT_RESPONSE_KEY) is None
    )
    assert len(without_retrieval.training_examples) == 42

    only_retrieval = data.filter_training_examples(
        lambda ex: ex.get(INTENT_RESPONSE_KEY) is not None
    )
    assert len(only_retrieval.training_examples) == 4

    # make sure filtering operation doesn't mutate the source training data
    assert len(data.training_examples) == 46
async def get_nlu_data(self, languages=True) -> Dict[Text, TrainingData]:
    """Load NLU training data for one or more languages.

    Args:
        languages: A single language code, a list of codes, or any
            non-list value (e.g. the default ``True``) to load every
            language configured in ``self.nlu_config``.

    Returns:
        A mapping from language code to ``TrainingData``. When a single
        language string was passed, only that language's ``TrainingData``
        is returned (an empty ``TrainingData`` if it could not be loaded).

    Raises:
        ValueError: If loading fails for a reason other than an unknown
            data format.
    """
    language = None
    if isinstance(languages, str):
        # A single language was requested; remember it for the return value.
        language = languages
        languages = [language]
    if not isinstance(languages, list):
        # Fall back to every configured language.
        languages = self.nlu_config.keys()
    td = {}
    for lang in languages:
        try:
            td[lang] = utils.training_data_from_paths(
                self.path_for_nlu_lang(lang),
                lang,
            )
        except ValueError as e:
            if str(e).startswith("Unknown data format"):
                # Tolerate unreadable files by using empty training data.
                td[lang] = TrainingData()
            else:
                # BUG FIX: previously any other ValueError was silently
                # swallowed, leaving the language missing from the result.
                raise
    if language:
        return td.get(language, TrainingData())
    return td
def test_train_test_split(filepaths: List[Text]):
    """Check that an 80/20 split preserves totals and class coverage."""
    from rasa.shared.importers.utils import training_data_from_paths

    td = training_data_from_paths(filepaths, language="en")

    assert td.intents == {
        "affirm",
        "greet",
        "restaurant_search",
        "goodbye",
        "chitchat",
    }
    assert td.entities == {"location", "cuisine"}
    assert set(td.responses.keys()) == {
        "utter_chitchat/ask_name",
        "utter_chitchat/ask_weather",
    }
    assert len(td.training_examples) == 46
    assert len(td.intent_examples) == 46
    assert len(td.response_examples) == 4

    train, test = td.train_test_split(train_frac=0.8)

    # No example may be lost or duplicated by the split.
    assert len(test.training_examples) + len(train.training_examples) == 46
    assert len(train.training_examples) == 34
    assert len(test.training_examples) == 12

    # Every intent and response class must appear in both splits.
    for split in (train, test):
        assert len(td.number_of_examples_per_intent.keys()) == len(
            split.number_of_examples_per_intent.keys()
        )
        assert len(td.number_of_examples_per_response.keys()) == len(
            split.number_of_examples_per_response.keys()
        )
def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
    """Retrieves NLU training data (see parent class for full docstring)."""
    # Delegate parsing of the configured NLU files to the shared helper.
    nlu_files = self._nlu_files
    return utils.training_data_from_paths(nlu_files, language)
async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
    """Loads NLU training data for ``language`` from this importer's files."""
    files = self._nlu_files
    return utils.training_data_from_paths(files, language)