async def get_nlu_data(self, languages=True) -> Dict[Text, TrainingData]:
    """Load NLU training data for the requested languages.

    Args:
        languages: One of
            - a string ``"data_for_<lang>"``: return the TrainingData for
              that single language directly (not wrapped in a dict),
            - a list of language codes: load data for exactly those,
            - anything else (default ``True``): load data for every
              language configured in ``self.nlu_config``.

    Returns:
        Mapping of language code to :class:`TrainingData`, or a single
        ``TrainingData`` for the ``"data_for_"`` string form.

    Raises:
        ValueError: Re-raised for any data error other than an unknown
            data format (unknown formats fall back to empty data).
    """
    if isinstance(languages, str) and languages.startswith('data_for_'):
        lang = languages.replace('data_for_', '')
        return utils.training_data_from_paths(
            [self.nlu_config[lang]['path']], 'xx')
    if not isinstance(languages, list):
        languages = self.nlu_config.keys()
    td = {}
    for lang in languages:
        try:
            td[lang] = utils.training_data_from_paths(
                [self.nlu_config[lang]['path']], 'xx')
        except ValueError as e:
            if str(e).startswith("Unknown data format"):
                # Unrecognised file format: degrade gracefully to empty data.
                from rasa.nlu.training_data import TrainingData
                td[lang] = TrainingData()
            else:
                # BUG FIX: the original silently dropped every other
                # ValueError, leaving `lang` missing from the result.
                raise
    return td
def test_train_test_split(filepaths):
    """Splitting demo data 80/20 keeps totals, intents and responses intact."""
    from rasa.importers.utils import training_data_from_paths

    td = training_data_from_paths(filepaths, language="en")

    expected_intents = {"affirm", "greet", "restaurant_search", "goodbye", "chitchat"}
    expected_responses = {"chitchat/ask_name", "chitchat/ask_weather"}
    assert td.intents == expected_intents
    assert td.entities == {"location", "cuisine"}
    assert set(td.responses.keys()) == expected_responses
    assert len(td.training_examples) == 46
    assert len(td.intent_examples) == 46
    assert len(td.response_examples) == 4

    td_train, td_test = td.train_test_split(train_frac=0.8)

    # The split partitions the examples without losing any.
    total = len(td_test.training_examples) + len(td_train.training_examples)
    assert total == 46
    assert len(td_train.training_examples) == 34
    assert len(td_test.training_examples) == 12

    # Every intent and every response appears on both sides of the split.
    n_intents = len(td.number_of_examples_per_intent.keys())
    assert n_intents == len(td_test.number_of_examples_per_intent.keys())
    assert n_intents == len(td_train.number_of_examples_per_intent.keys())
    n_responses = len(td.number_of_examples_per_response.keys())
    assert n_responses == len(td_test.number_of_examples_per_response.keys())
    assert n_responses == len(td_train.number_of_examples_per_response.keys())
def test_demo_data(files):
    """Loading the demo files yields the expected intents, entities,
    synonyms and regex features."""
    from rasa.importers.utils import training_data_from_paths

    td = training_data_from_paths(files, language="en")

    assert td.intents == {"affirm", "greet", "restaurant_search", "goodbye", "chitchat"}
    assert td.entities == {"location", "cuisine"}
    assert set(td.responses.keys()) == {"chitchat/ask_name", "chitchat/ask_weather"}

    # Example counts across the different example views.
    assert len(td.training_examples) == 46
    assert len(td.intent_examples) == 46
    assert len(td.response_examples) == 4
    assert len(td.entity_examples) == 11
    assert len(td.responses) == 2

    expected_synonyms = {
        "Chines": "chinese",
        "Chinese": "chinese",
        "chines": "chinese",
        "vegg": "vegetarian",
        "veggie": "vegetarian",
    }
    assert td.entity_synonyms == expected_synonyms

    expected_regexes = [
        {"name": "greet", "pattern": r"hey[^\s]*"},
        {"name": "zipcode", "pattern": r"[0-9]{5}"},
    ]
    assert td.regex_features == expected_regexes
async def save_from_path(self, path: Text, bot: Text, overwrite: bool = True, user="******"):
    """Load a Rasa project from *path* and persist all of its artifacts.

    Reads the domain, stories, NLU data and config from the standard
    project layout and saves each piece via the corresponding
    ``save_*`` method.

    Args:
        path: Root directory of the Rasa project.
        bot: Bot identifier the data is saved under.
        overwrite: Accepted for interface compatibility (not used here).
        user: User recorded as the author of the save.

    Raises:
        AppException: If the domain fails validation, or for any other
            error while reading or saving the project.
    """
    try:
        story_files, nlu_files = get_core_nlu_files(
            os.path.join(path, DEFAULT_DATA_PATH))
        nlu = utils.training_data_from_paths(nlu_files, "en")
        domain = Domain.from_file(os.path.join(path, DEFAULT_DOMAIN_PATH))
        domain.check_missing_templates()
        story_steps = await StoryFileReader.read_from_files(
            story_files, domain)
        config = read_config_file(os.path.join(path, DEFAULT_CONFIG_PATH))
        self.save_domain(domain, bot, user)
        self.save_stories(story_steps, bot, user)
        self.save_nlu(nlu, bot, user)
        self.save_config(config, bot, user)
    except InvalidDomain as e:
        # FIX: log at exception level (with traceback) instead of info,
        # and chain the cause so the original error is not lost.
        logging.exception(e)
        raise AppException("""Failed to validate yaml file. Please make sure the file is initial and all mandatory parameters are specified""") from e
    except Exception as e:
        logging.exception(e)
        raise AppException(e) from e
def view_file(self, file, lang='en'):
    """Pretty-print the contents of a single NLU training-data file.

    $ python -m saai.nlu_data_procs view_file ./nlu_multilang/en/nlu_data.md
    $ python -m saai.nlu_data_procs view_file /pi/ws/knowledgebasebot/data/nlu.md

    :param file: path of the training-data file to inspect
    :param lang: language code passed to the loader
    :return: None (output goes to stdout)
    """
    from pprint import pprint

    td = training_data_from_paths([file], language=lang)

    print('.. examples')
    examples = [(ex.get("intent"), ex.text) for ex in td.training_examples]
    print(*examples, sep='\n')

    tc.emp('green', '.. intents')
    for name in td.intents:
        tc.emp('yellow', f" - {name}")

    tc.emp('green', '.. entities')
    print(td.entities)

    tc.emp('green', '.. lookup_tables')
    pprint(td.lookup_tables)
def test_train_test_split(filepaths):
    """An 80/20 split of the demo data produces 35 train / 11 test examples."""
    from rasa.importers.utils import training_data_from_paths

    td = training_data_from_paths(filepaths, language="en")

    assert td.intents == {"affirm", "greet", "restaurant_search", "goodbye", "chitchat"}
    assert td.entities == {"location", "cuisine"}

    example_count = len(td.training_examples)
    assert example_count == 46
    assert len(td.intent_examples) == 46

    train_part, test_part = td.train_test_split(train_frac=0.8)
    assert len(train_part.training_examples) == 35
    assert len(test_part.training_examples) == 11
def test_train_test_split_with_random_seed(filepaths):
    """The same random seed must yield an identical train/test split."""
    from rasa.importers.utils import training_data_from_paths

    td = training_data_from_paths(filepaths, language="en")

    first_train, first_test = td.train_test_split(train_frac=0.8, random_seed=1)
    second_train, second_test = td.train_test_split(train_frac=0.8, random_seed=1)

    def texts(data):
        # Compare splits by their example texts only.
        return [example.get(TEXT) for example in data.intent_examples]

    assert texts(first_train) == texts(second_train)
    assert texts(first_test) == texts(second_test)
def test_demo_data_filter_out_retrieval_intents(files):
    """Filtering by retrieval-intent key partitions the examples 42/4 and
    leaves the original data untouched."""
    from rasa.importers.utils import training_data_from_paths

    training_data = training_data_from_paths(files, language="en")
    assert len(training_data.training_examples) == 46

    without_retrieval = training_data.filter_training_examples(
        lambda ex: ex.get(INTENT_RESPONSE_KEY) is None)
    assert len(without_retrieval.training_examples) == 42

    only_retrieval = training_data.filter_training_examples(
        lambda ex: ex.get(INTENT_RESPONSE_KEY) is not None)
    assert len(only_retrieval.training_examples) == 4

    # make sure filtering operation doesn't mutate the source training data
    assert len(training_data.training_examples) == 46
async def get_nlu_data(self, languages=True) -> Dict[Text, TrainingData]:
    """Load NLU training data per language.

    Args:
        languages: A single language code (string), a list of codes, or
            anything else (default ``True``) meaning "all languages in
            ``self.nlu_config``".

    Returns:
        For a single-string argument: that language's TrainingData
        (empty TrainingData if it could not be loaded). Otherwise a dict
        mapping each language code to its TrainingData.

    Raises:
        ValueError: Re-raised for any data error other than an unknown
            data format (unknown formats fall back to empty data).
    """
    language = None
    if isinstance(languages, str):
        language = languages
        languages = [language]
    if not isinstance(languages, list):
        languages = self.nlu_config.keys()
    td = {}
    for lang in languages:
        try:
            td[lang] = utils.training_data_from_paths(
                self.path_for_nlu_lang(lang),
                lang,
            )
        except ValueError as e:
            if str(e).startswith("Unknown data format"):
                # Unrecognised format: degrade gracefully to empty data.
                td[lang] = TrainingData()
            else:
                # BUG FIX: the original silently dropped every other
                # ValueError, leaving `lang` missing from the result.
                raise
    if language:
        return td.get(language, TrainingData())
    return td
def save_from_path(self, path: Text, bot: Text, user="******"):
    """Synchronously load a Rasa project from *path* and persist it.

    Reads the domain, stories, NLU data and config from the standard
    project layout, driving the async story reader on a dedicated event
    loop, then saves each piece.

    Args:
        path: Root directory of the Rasa project.
        bot: Bot identifier the data is saved under.
        user: User recorded as the author of the save.

    Raises:
        AppException: If the domain fails validation, or for any other
            error while reading or saving the project.
    """
    try:
        story_files, nlu_files = get_core_nlu_files(
            os.path.join(path, DEFAULT_DATA_PATH))
        nlu = utils.training_data_from_paths(nlu_files, "en")
        domain = Domain.from_file(os.path.join(path, DEFAULT_DOMAIN_PATH))
        # FIX: the original leaked the event loop — close it once the
        # coroutine has finished.
        loop = asyncio.new_event_loop()
        try:
            story_steps = loop.run_until_complete(
                StoryFileReader.read_from_files(story_files, domain))
        finally:
            loop.close()
        self.save_domain(domain, bot, user)
        self.save_stories(story_steps, bot, user)
        self.save_nlu(nlu, bot, user)
        self.__save_config(
            read_config_file(os.path.join(path, DEFAULT_CONFIG_PATH)),
            bot, user)
    except InvalidDomain as e:
        # FIX: log at exception level (with traceback) instead of info,
        # and chain the cause so the original error is not lost.
        logging.exception(e)
        raise AppException("""Failed to validate yaml file. Please make sure the file is initial and all mandatory parameters are specified""") from e
    except Exception as e:
        logging.exception(e)
        raise AppException(e) from e
async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
    """Read and merge the configured NLU training files for *language*."""
    nlu_files = self._nlu_files
    return utils.training_data_from_paths(nlu_files, language)
async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
    """Read and merge this importer's NLU training files for *language*."""
    from rasa.importers import utils

    nlu_files = self.nlu_files
    return utils.training_data_from_paths(nlu_files, language)