from typing import Iterable, Text

from rasa.nlu.training_data import TrainingData


def training_data_from_paths(paths: Iterable[Text], language: Text) -> TrainingData:
    from rasa.nlu.training_data import loading

    # Load each NLU file individually, then merge everything into a single
    # TrainingData object so downstream consumers see one dataset.
    training_datas = [loading.load_data(nlu_file, language) for nlu_file in paths]
    merged_training_data = TrainingData().merge(*training_datas)
    # Attach full response texts to retrieval-intent examples.
    merged_training_data.fill_response_phrases()
    return merged_training_data
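# A minimal usage sketch, assuming two hypothetical NLU files on disk (the
# paths below are placeholders, not files shipped with the project):
merged = training_data_from_paths(["data/nlu.md", "data/responses.md"], language="en")
print(f"merged {len(merged.training_examples)} training examples")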
from typing import Optional, Text

import faker

from rasa.nlu.training_data import TrainingData, loading
from rasa.nlu.training_data.message import Message


async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
    # Read the placeholder importer's configuration, falling back to the
    # default number of generated examples per templated message.
    fake_data_count = self.DEFAULT_FAKE_DATA_COUNT
    for importer in self.config["importers"]:
        if importer.get("name") == "rasam.PlaceholderImporter":
            fake_data_count = importer.get("fake_data_count", self.DEFAULT_FAKE_DATA_COUNT)

    # Seed Faker with the count so repeated imports produce identical data.
    faker_ = faker.Faker()
    faker_.seed_instance(fake_data_count)

    training_data = [loading.load_data(nlu_file, language) for nlu_file in self._nlu_files]

    new_training_data = []
    for data in training_data:
        training_examples = []
        example: Message
        for example in data.training_examples:
            if example.get("intent"):
                matches = [i async for i in self.find_placeholders(example.text)]
                if matches:
                    # Expand one templated example into several concrete ones.
                    async for new_message in self.replace_placeholders(
                        example, faker_, matches, fake_data_count
                    ):
                        training_examples.append(new_message)
                else:
                    training_examples.append(example)
            else:
                training_examples.append(example)
        new_training_data.append(
            TrainingData(
                training_examples,
                data.entity_synonyms,
                data.regex_features,
                data.lookup_tables,
                data.nlg_stories,
            )
        )

    merged_training_data = TrainingData().merge(*new_training_data)
    merged_training_data.fill_response_phrases()
    return merged_training_data
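# The two async hooks called above are methods of the importer. The sketch
# below is an assumption about their shape, not rasam's actual implementation:
# it treats "{name}" tokens as placeholders and asks Faker for a provider of
# the same name (e.g. "{name}" -> faker_.name()), written as free functions
# for brevity.
import re
from typing import AsyncIterator, List


async def find_placeholders(text: Text) -> AsyncIterator["re.Match"]:
    # Yield one regex match per "{placeholder}" token in the example text.
    for match in re.finditer(r"\{(\w+)\}", text):
        yield match


async def replace_placeholders(
    example: Message, faker_: "faker.Faker", matches: List["re.Match"], count: int
) -> AsyncIterator[Message]:
    # Emit `count` variants of the example, each with every placeholder
    # replaced by a freshly generated Faker value.
    for _ in range(count):
        text = example.text
        for match in matches:
            # Assumes a Faker provider with the placeholder's name exists.
            value = getattr(faker_, match.group(1))()
            text = text.replace(match.group(0), str(value), 1)
        yield Message.build(text, example.get("intent"))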
import os
from typing import Optional, Text

# `io_utils` and `_load` are module-level siblings of this function inside
# rasa.nlu.training_data.loading.
def load_data(resource_name: Text, language: Optional[Text] = "en") -> "TrainingData":
    """Load training data from disk.

    If `resource_name` is a directory, all files found in it are merged into
    a single TrainingData object.
    """
    from rasa.nlu.training_data import TrainingData

    if not os.path.exists(resource_name):
        raise ValueError(f"File '{resource_name}' does not exist.")

    files = io_utils.list_files(resource_name)
    data_sets = [_load(f, language) for f in files]
    # Drop files that produced no training data before merging.
    data_sets = [ds for ds in data_sets if ds]

    if len(data_sets) == 0:
        training_data = TrainingData()
    elif len(data_sets) == 1:
        training_data = data_sets[0]
    else:
        training_data = data_sets[0].merge(*data_sets[1:])

    if training_data.nlg_stories:
        training_data.fill_response_phrases()

    return training_data
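# Usage sketch with a hypothetical data directory: every file under the
# directory that yields training data is merged into one TrainingData object.
td = load_data("data/nlu", language="en")
print(f"loaded {len(td.training_examples)} examples")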
import os
import random
from collections import defaultdict

from rasa.nlu.training_data import TrainingData


def import_data(self, faq_path) -> None:
    # Configure the project-specific FAQImporter for a single FAQ file,
    # without synonym augmentation.
    config_path = os.path.join("bots/es-faq", "config.yml")
    importer = FAQImporter(config_path)
    importer.faq_paths = [faq_path]
    importer.synonym_paths = []
    importer.augment_with_synonyms = False
    importer.timezone = "Asia/Taipei"

    training_examples, nlg_stories, entity_synonyms = importer._load_files()
    training_data = TrainingData(
        training_examples=training_examples,
        entity_synonyms=entity_synonyms,
        regex_features=[],
        lookup_tables=[],
        nlg_stories=nlg_stories,
    )
    training_data.fill_response_phrases()

    # training_data = load_data("data/train_zh_cn.json")
    # td_responses = load_data("data/responses_zh_cn.md")
    # training_data = training_data.merge(td_responses)
    # training_data.fill_response_phrases()

    # Train/test split: group FAQ examples by response key, then take a
    # deterministic 80/20 split per group so every answer keeps at least one
    # training example.
    training_examples = [
        ex for ex in training_data.training_examples if ex.get("intent") == "faq"
    ]
    examples_per_response_key = defaultdict(list)
    for ex in training_examples:
        examples_per_response_key[ex.get("response_key")].append(ex)
    for intent in examples_per_response_key:
        examples = examples_per_response_key[intent]
        random.Random(1).shuffle(examples)
        split_offset = max(1, int(0.8 * len(examples)))
        self.train_exs.extend(examples[:split_offset])
        self.test_exs.extend(examples[split_offset:])
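# The per-response-key split above is a stratified split. Below is a
# standalone sketch of the same idea with hypothetical names, generalised
# over any grouping key:
from collections import defaultdict
import random
from typing import Callable, Hashable, Iterable, List, Tuple


def stratified_split(
    examples: Iterable,
    key: Callable[[object], Hashable],
    train_fraction: float = 0.8,
    seed: int = 1,
) -> Tuple[List, List]:
    groups = defaultdict(list)
    for ex in examples:
        groups[key(ex)].append(ex)
    train, test = [], []
    for group in groups.values():
        # A fixed seed per group keeps the split reproducible across runs.
        random.Random(seed).shuffle(group)
        # max(1, ...) guarantees each group contributes a training example.
        offset = max(1, int(train_fraction * len(group)))
        train.extend(group[:offset])
        test.extend(group[offset:])
    return train, test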