def read_nlu_data():
    try:
        cache_dir = sh.ls(FLAGS.cache_dir)
        if 'id2class.set' in cache_dir and 'intent_examples.set' in cache_dir:
            id2class_path = os.path.join(FLAGS.cache_dir, 'id2class.set')
            id2class_lock_path = id2class_path + '.lock'
            intent_examples_path = os.path.join(FLAGS.cache_dir, 'intent_examples.set')
            intent_examples_lock_path = intent_examples_path + '.lock'
            with FileLock(id2class_lock_path):
                id2class = torch.load(id2class_path)
            with FileLock(intent_examples_lock_path):
                intent_examples = torch.load(intent_examples_path)
            return id2class, intent_examples
    except Exception as e:
        logging.error(e)
        sh.mkdir(FLAGS.cache_dir)
    data = load_data(FLAGS.data_dir, 'zh')
    id2class = dict(enumerate(data.intents))
    intent_examples = data.intent_examples
    torch.save(id2class, os.path.join(FLAGS.cache_dir, 'id2class.set'))
    torch.save(intent_examples, os.path.join(FLAGS.cache_dir, 'intent_examples.set'))
    return id2class, intent_examples
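# A minimal sketch (not part of the functions in this listing) of the cache pattern they
# rely on: torch.save/torch.load for serialisation, guarded by filelock.FileLock so that
# concurrent workers do not read a half-written file. The helper name and cache path are
# assumptions for illustration only.
import os
import torch
from filelock import FileLock

def cached_load(path, build_fn):
    """Return the object cached at `path` if present, otherwise build, cache and return it."""
    lock = FileLock(path + '.lock')
    if os.path.exists(path):
        with lock:
            return torch.load(path)
    obj = build_fn()
    with lock:
        torch.save(obj, path)
    return obj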
def training_data_from_paths(paths: Iterable[Text], language: Text) -> TrainingData:
    from rasa.nlu.training_data import loading

    training_data_sets = [loading.load_data(nlu_file, language) for nlu_file in paths]
    return TrainingData().merge(*training_data_sets)
def _read_nlu_data(self):
    try:
        cache_dir = sh.ls(FLAGS.cache_dir)
        if ('id2entity.set' in cache_dir and 'entity_examples.set' in cache_dir
                and 'id2class.set' in cache_dir and 'intent_examples.set' in cache_dir):
            id2entity_path = os.path.join(FLAGS.cache_dir, 'id2entity.set')
            id2entity_lock_path = id2entity_path + '.lock'
            entity_examples_path = os.path.join(FLAGS.cache_dir, 'entity_examples.set')
            entity_examples_lock_path = entity_examples_path + '.lock'
            id2class_path = os.path.join(FLAGS.cache_dir, 'id2class.set')
            id2class_lock_path = id2class_path + '.lock'
            intent_examples_path = os.path.join(FLAGS.cache_dir, 'intent_examples.set')
            intent_examples_lock_path = intent_examples_path + '.lock'
            with FileLock(id2entity_lock_path):
                id2entity = torch.load(id2entity_path)
            with FileLock(entity_examples_lock_path):
                entity_examples = torch.load(entity_examples_path)
            with FileLock(id2class_lock_path):
                id2class = torch.load(id2class_path)
            with FileLock(intent_examples_lock_path):
                intent_examples = torch.load(intent_examples_path)
            return id2entity, entity_examples, id2class, intent_examples
    except Exception as e:
        logging.error(e)
        sh.mkdir(FLAGS.cache_dir)
    data = load_data(FLAGS.data_dir, 'zh')
    entity_lists, entity_examples_cooked, intent_examples = ['O'], [], []
    for item in data.training_examples:
        training_text = item.text
        training_data = item.data
        entity_examples_cooked.append(self._predata(training_text, training_data.get("entities", [])))
        intent_examples.append(training_data.get("intent", None))
    for entity in data.entities:
        for tag in ['B', 'I']:
            entity_lists.append(tag + '-' + entity)
    id2entity = dict(enumerate(entity_lists))
    id2class = dict(enumerate(data.intents))
    torch.save(id2entity, os.path.join(FLAGS.cache_dir, 'id2entity.set'))
    torch.save(entity_examples_cooked, os.path.join(FLAGS.cache_dir, 'entity_examples.set'))
    torch.save(id2class, os.path.join(FLAGS.cache_dir, 'id2class.set'))
    torch.save(intent_examples, os.path.join(FLAGS.cache_dir, 'intent_examples.set'))
    return id2entity, entity_examples_cooked, id2class, intent_examples
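# `self._predata` is referenced above but its body is not shown. The following is a
# hypothetical sketch of what such a helper might do: convert an example's text plus its
# entity span dicts (Rasa-style "start"/"end"/"entity" keys) into character-level BIO
# tags. This is an assumption about its behaviour, not the project's actual implementation.
def _predata_sketch(text, entities):
    tags = ['O'] * len(text)
    for entity in entities:
        start, end, label = entity['start'], entity['end'], entity['entity']
        tags[start] = 'B-' + label
        for i in range(start + 1, end):
            tags[i] = 'I-' + label
    return list(text), tags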
def training_data_from_paths(paths: Iterable[Text], language: Text) -> TrainingData:
    from rasa.nlu.training_data import loading

    training_datas = [loading.load_data(nlu_file, language) for nlu_file in paths]
    merged_training_data = TrainingData().merge(*training_datas)
    merged_training_data.fill_response_phrases()
    return merged_training_data
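# Hypothetical usage of training_data_from_paths against the rasa 1.x
# `rasa.nlu.training_data` API used above; the file names are assumptions.
merged = training_data_from_paths(["data/nlu.md", "data/chitchat.md"], language="en")
print(f"{len(merged.training_examples)} examples, intents: {sorted(merged.intents)}")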
def split_nlu_data(args):
    from rasa.nlu.training_data.loading import load_data

    data_path = get_validated_path(args.nlu, "nlu", DEFAULT_DATA_PATH)
    data_path = data.get_nlu_directory(data_path)

    nlu_data = load_data(data_path)
    train, test = nlu_data.train_test_split(args.training_fraction)

    train.persist(args.out, filename="training_data.json")
    test.persist(args.out, filename="test_data.json")
def split_nlu_data(args: argparse.Namespace) -> None:
    from rasa.nlu.training_data.loading import load_data
    from rasa.nlu.training_data.util import get_file_format

    data_path = rasa.cli.utils.get_validated_path(args.nlu, "nlu", DEFAULT_DATA_PATH)
    data_path = data.get_nlu_directory(data_path)

    nlu_data = load_data(data_path)
    fformat = get_file_format(data_path)

    train, test = nlu_data.train_test_split(args.training_fraction, args.random_seed)

    train.persist(args.out, filename=f"training_data.{fformat}")
    test.persist(args.out, filename=f"test_data.{fformat}")
def __init__(self, agentName, botconfig, data, **kwargs):
    logger.info("Training Agent " + agentName + " in progress")
    trainingData = load_data(data)
    self.intents = list(trainingData.intents)
    self.entities = list(trainingData.entities)
    trainer = Trainer(config.load(botconfig))
    self.interpreter = trainer.train(trainingData)
    self.model_path = "./models/" + agentName + "/"
    persist_path = trainer.persist(self.model_path)
    self.tar_path = package_model(
        fingerprint=None, train_path=persist_path, output_directory=self.model_path
    )
    self.model_name = self.tar_path.replace(self.model_path, "")
    self.model_version = self.model_name[:self.model_name.index(".tar.gz")]
def split_nlu_data(args):
    from rasa.nlu.training_data.loading import load_data
    from rasa.nlu.training_data.util import get_file_format

    data_path = get_validated_path(args.nlu, "nlu", DEFAULT_DATA_PATH)
    data_path = data.get_nlu_directory(data_path)

    nlu_data = load_data(data_path)
    fformat = get_file_format(data_path)

    train, test = nlu_data.train_test_split(args.training_fraction)

    train.persist(args.out, filename="training_data.{}".format(fformat), fformat=fformat)
    test.persist(args.out, filename="test_data.{}".format(fformat), fformat=fformat)
async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
    fake_data_count = self.DEFAULT_FAKE_DATA_COUNT
    for importer in self.config["importers"]:
        if importer.get("name") == "rasam.PlaceholderImporter":
            fake_data_count = importer.get("fake_data_count", self.DEFAULT_FAKE_DATA_COUNT)

    faker_ = faker.Faker()
    faker_.seed_instance(fake_data_count)

    training_data = [loading.load_data(nlu_file, language) for nlu_file in self._nlu_files]

    new_training_data = []
    for data in training_data:
        training_examples = []
        example: Message
        for example in data.training_examples:
            if example.get("intent"):
                matches = [i async for i in self.find_placeholders(example.text)]
                if matches:
                    async for new_message in self.replace_placeholders(
                        example, faker_, matches, fake_data_count
                    ):
                        training_examples.append(new_message)
                else:
                    training_examples.append(example)
            else:
                training_examples.append(example)
        new_training_data.append(
            TrainingData(
                training_examples,
                data.entity_synonyms,
                data.regex_features,
                data.lookup_tables,
                data.nlg_stories,
            )
        )

    merged_training_data = TrainingData().merge(*new_training_data)
    merged_training_data.fill_response_phrases()
    return merged_training_data
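# A standalone sketch of the placeholder-substitution idea used above: tokens such as
# "{name}" or "{city}" in an example text are replaced with values generated by faker.
# The token syntax, provider mapping, and helper name are assumptions; this does not
# mirror rasam's actual find_placeholders/replace_placeholders helpers.
import re
import faker

def fill_placeholders(text, fake):
    providers = {"name": fake.name, "city": fake.city, "date": fake.date}
    return re.sub(
        r"\{(\w+)\}",
        lambda m: providers[m.group(1)]() if m.group(1) in providers else m.group(0),
        text,
    )

fake = faker.Faker()
fake.seed_instance(42)
print(fill_placeholders("book a flight to {city} for {name}", fake))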
async def _write_nlu_to_file(
    export_nlu_path: Text, evts: List[Dict[Text, Any]]
) -> None:
    """Write the nlu data of the sender_id to the file paths."""
    from rasa.nlu.training_data import TrainingData

    msgs = _collect_messages(evts)

    # noinspection PyBroadException
    try:
        previous_examples = load_data(export_nlu_path)
    except Exception as e:
        logger.exception("An exception occurred while trying to load the NLU data.")

        export_nlu_path = questionary.text(
            message="Could not load existing NLU data, please "
                    "specify where to store NLU data learned in "
                    "this session (this will overwrite any "
                    "existing file). {}".format(str(e)),
            default=PATHS["backup"],
        ).ask()

        if export_nlu_path is None:
            return

        previous_examples = TrainingData()

    nlu_data = previous_examples.merge(TrainingData(msgs))

    # need to guess the format of the file before opening it to avoid a read
    # in a write
    if _guess_format(export_nlu_path) in {"md", "unk"}:
        fformat = "md"
    else:
        fformat = "json"

    with open(export_nlu_path, 'w', encoding="utf-8") as f:
        if fformat == "md":
            f.write(nlu_data.as_markdown())
        else:
            f.write(nlu_data.as_json())
def split_nlu_data(args: argparse.Namespace) -> None:
    """Load data from a file path and split the NLU data into test and train examples.

    Args:
        args: Commandline arguments
    """
    from rasa.nlu.training_data.loading import load_data
    from rasa.nlu.training_data.util import get_file_format

    data_path = rasa.cli.utils.get_validated_path(args.nlu, "nlu", DEFAULT_DATA_PATH)
    data_path = data.get_nlu_directory(data_path)

    nlu_data = load_data(data_path)
    fformat = get_file_format(data_path)

    train, test = nlu_data.train_test_split(args.training_fraction, args.random_seed)

    train.persist(args.out, filename=f"training_data.{fformat}")
    test.persist(args.out, filename=f"test_data.{fformat}")
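# Hypothetical invocation of split_nlu_data outside the Rasa CLI: build the
# argparse.Namespace by hand. The attribute names follow the function above; the paths
# and values are assumptions.
import argparse

args = argparse.Namespace(
    nlu="data/nlu.md",          # assumed path to an NLU training file
    out="train_test_split",     # output directory for the split files
    training_fraction=0.8,
    random_seed=42,
)
split_nlu_data(args)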
def _read_nlu_data(self):
    try:
        cache_dir = sh.ls(FLAGS.cache_dir)
        if 'id2entity.set' in cache_dir and 'entity_examples.set' in cache_dir:
            id2entity_path = os.path.join(FLAGS.cache_dir, 'id2entity.set')
            id2entity_lock_path = id2entity_path + '.lock'
            entity_examples_path = os.path.join(FLAGS.cache_dir, 'entity_examples.set')
            entity_examples_lock_path = entity_examples_path + '.lock'
            with FileLock(id2entity_lock_path):
                id2entity = torch.load(id2entity_path)
            with FileLock(entity_examples_lock_path):
                entity_examples = torch.load(entity_examples_path)
            return id2entity, entity_examples
    except Exception as e:
        logging.error(e)
        sh.mkdir(FLAGS.cache_dir)
    data = load_data(FLAGS.data_dir, 'zh')
    entities, entity_examples = data.entities, data.entity_examples
    entity_lists, entity_examples_cooked = ['O'], []
    for example in entity_examples:
        entity_examples_cooked.append(self._predata(example.text, example.get("entities", [])))
    for entity in entities:
        for tag in ['B', 'I']:
            entity_lists.append(tag + '-' + entity)
    id2entity = dict(enumerate(entity_lists))
    torch.save(id2entity, os.path.join(FLAGS.cache_dir, 'id2entity.set'))
    torch.save(entity_examples_cooked, os.path.join(FLAGS.cache_dir, 'entity_examples.set'))
    return id2entity, entity_examples_cooked
def test_load_data_from_non_existing_file():
    with pytest.raises(ValueError):
        load_data("some path")
async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
    from rasa.nlu.training_data import loading

    path_to_nlu_file = self._custom_get_nlu_file()
    return loading.load_data(path_to_nlu_file)
def generate_domain(self) -> Text:
    logger.debug("Generating domain file")
    results = self.get_tagged_entries(self.tag_dict)  # Get tagged entries

    # If nlu_file is specified, look for intents and entities within the NLU file.
    if self.nlu_file:
        logger.debug("Extracting entities and intents from nlu training data (%s)" % self.nlu_file)
        nlu_data = loading.load_data(self.nlu_file)
        if nlu_data:
            if len(results["entities"]) == 0:
                results["entities"] = list(nlu_data.entities)
            if len(results["intents"]) == 0:
                results["intents"] = list(nlu_data.intents)

    # If actions_dir is specified, look for registered actions within the actions directory.
    # Keep only the actions / forms that were found within the NLU file.
    if self.actions_dir:
        logger.debug("Extracting actions from action directory (%s)" % self.actions_dir)
        actions = self.get_actions(self.actions_dir)
        if "actions" in actions.keys():
            results["actions"] = actions["actions"]
        if "forms" in actions.keys():
            results["forms"] = actions["forms"]

    logger.debug("Merging identified utterances")
    # If templates exist, append them to the actions.
    if "templates" in results.keys() and len(results["templates"]) > 0:
        results["actions"] = results["actions"] + list(results["templates"].keys())

    logger.debug("Formatting output")
    # Iterate through the output, identify existing tags, and remove keys that don't exist.
    for tag in VALID_SEARCH_TAGS:
        if tag in results.keys() and len(results[tag]) > 0:
            print("Found %s %s" % (len(results[tag]), tag))
        else:
            # Remove the key from the results.
            del results[tag]
            logger.warning("No %s found" % tag)

    # Output the results to stdout; if an output file was specified, send them there instead.
    yaml = YAML()
    yaml.compact(seq_seq=False, seq_map=False)
    if self.output:
        # Output to file
        if os.path.isdir(self.output):
            logger.error("Output location (%s) is a directory; cannot overwrite" % self.output)
            return "Output location (%s) is a directory; cannot overwrite" % self.output
        elif os.path.isfile(self.output):
            logger.warning("Output file %s already exists, overwriting..." % self.output)
        output_path = self.output if os.path.isabs(self.output) else os.path.join(os.path.abspath(os.curdir), self.output)
        try:
            stream = open(output_path, "w")
        except IOError:
            stream = open(Path(output_path), "w")  # Create the file!
        yaml.dump(results, stream)
        print("Results saved to %s" % self.output)
    else:
        yaml.dump(results, sys.stdout)