def _write_nlu_to_file(
        export_nlu_path: Text,
        evts: List[Dict[Text, Any]]
) -> None:
    """Write the NLU data of the sender_id to the given file path."""
    msgs = _collect_messages(evts)

    # noinspection PyBroadException
    try:
        previous_examples = load_data(export_nlu_path)
    except Exception as e:
        logger.exception("An exception occurred while trying to load the "
                         "NLU data.")

        export_nlu_path = questionary.text(
            message="Could not load existing NLU data, please "
                    "specify where to store NLU data learned in "
                    "this session (this will overwrite any "
                    "existing file). {}".format(str(e)),
            default=PATHS["backup"]).ask()

        if export_nlu_path is None:
            return

        previous_examples = TrainingData()

    nlu_data = previous_examples.merge(TrainingData(msgs))

    with io.open(export_nlu_path, 'w', encoding="utf-8") as f:
        if _guess_format(export_nlu_path) in {"md", "unk"}:
            f.write(nlu_data.as_markdown())
        else:
            f.write(nlu_data.as_json())
def _write_nlu_to_file(
        export_nlu_path: Text,
        evts: List[Dict[Text, Any]]
) -> None:
    """Write the NLU data of the sender_id to the given file path."""
    msgs = _collect_messages(evts)

    # noinspection PyBroadException
    try:
        previous_examples = load_data(export_nlu_path)
    except Exception:
        questions = [{"name": "export nlu",
                      "type": "input",
                      "message": "Could not load existing NLU data, please "
                                 "specify where to store NLU data learned in "
                                 "this session (this will overwrite any "
                                 "existing file)",
                      "default": PATHS["backup"]}]
        answers = prompt(questions)
        export_nlu_path = answers["export nlu"]
        previous_examples = TrainingData()

    nlu_data = previous_examples.merge(TrainingData(msgs))

    with io.open(export_nlu_path, 'w', encoding="utf-8") as f:
        if _guess_format(export_nlu_path) in {"md", "unk"}:
            f.write(nlu_data.as_markdown())
        else:
            f.write(nlu_data.as_json())
def test_repeated_entities():
    data = u"""
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "book a table today from 3 to 6 for 3 people",
        "intent": "unk",
        "entities": [
          {
            "entity": "description",
            "start": 35,
            "end": 36,
            "value": 3
          }
        ]
      }
    ]
  }
}"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = TrainingData(f.name, 'mitie', 'en')
        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example["entities"]
        assert len(entities) == 1
        start, end = mitie_trainer_utils.find_entity(entities[0],
                                                     example["text"])
        assert start == 9
        assert end == 10
def test_nonascii_entities():
    data = u"""
{
  "luis_schema_version": "1.0",
  "utterances" : [
    {
      "text": "I am looking for a ßäæ ?€ö) item",
      "intent": "unk",
      "entities": [
        {
          "entity": "description",
          "startPos": 5,
          "endPos": 8
        }
      ]
    }
  ]
}"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = TrainingData(f.name, 'mitie', 'en')
        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example["entities"]
        assert len(entities) == 1
        entity = entities[0]
        assert entity["value"] == u"ßäæ ?€ö)"
        assert entity["start"] == 19
        assert entity["end"] == 27
        assert entity["entity"] == "description"
def load_train_data(data):
    """Create a TrainingData object from already parsed rasa NLU data."""
    validate_rasa_nlu_data(data)

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    entity_synonyms = get_entity_synonyms_dict(synonyms)

    if intent or entity:
        logger.warning(
            "DEPRECATION warning: Data file contains 'intent_examples' "
            "or 'entity_examples' which will be removed in the future. "
            "Consider putting all your examples into the "
            "'common_examples' section.")

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        example_data = {}
        if e.get("intent"):
            example_data["intent"] = e["intent"]
        if e.get("entities") is not None:
            example_data["entities"] = e["entities"]
        training_examples.append(Message(e["text"], example_data))

    return TrainingData(training_examples, entity_synonyms, regex_features)
def load_wit_data(filename):
    # type: (str) -> TrainingData
    """Loads training data stored in the WIT.ai data format."""
    intent_examples = []
    entity_examples = []
    common_examples = []

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())
    for s in data["data"]:
        entities = s.get("entities")
        if entities is None:
            continue
        text = s.get("text")
        intents = [e["value"] for e in entities if e["entity"] == 'intent']
        intent = intents[0] if intents else None

        entities = [e for e in entities if ("start" in e and "end" in e)]
        for e in entities:
            # wit wraps entity values in quotes; strip them off
            e["value"] = e["value"][1:-1]

        if intent and entities:
            common_examples.append(
                {"text": text, "intent": intent, "entities": entities})
        elif intent:
            intent_examples.append({"text": text, "intent": intent})
        elif entities:
            entity_examples.append(
                {"text": text, "intent": intent, "entities": entities})
    return TrainingData(intent_examples, entity_examples, common_examples)
def test_crf_extractor(spacy_nlp):
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ext = CRFEntityExtractor()
    examples = [
        Message("anywhere in the west", {
            "intent": "restaurant_search",
            "entities": [{"start": 16, "end": 20,
                          "value": "west", "entity": "location"}],
            "spacy_doc": spacy_nlp("anywhere in the west")
        }),
        Message("central indian restaurant", {
            "intent": "restaurant_search",
            "entities": [{"start": 0, "end": 7,
                          "value": "central", "entity": "location"}],
            "spacy_doc": spacy_nlp("central indian restaurant")
        })]
    config = {"ner_crf": {"BILOU_flag": True, "features": ext.crf_features}}
    ext.train(TrainingData(training_examples=examples), config)
    sentence = 'anywhere in the west'
    crf_format = ext._from_text_to_crf(
        Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
    assert [word[0] for word in crf_format] == \
           ['anywhere', 'in', 'the', 'west']
    feats = ext._sentence_to_features(crf_format)
    assert 'BOS' in feats[0]
    assert 'EOS' in feats[-1]
    assert feats[1]['0:low'] == "in"
    sentence = 'anywhere in the west'
    ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
def load_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""
    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    entity_synonyms = get_entity_synonyms_dict(synonyms)

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        example_data = e.copy()
        if "text" in example_data:
            del example_data["text"]
        training_examples.append(Message(e["text"], example_data))

    return TrainingData(training_examples, entity_synonyms, regex_features)
def test_multiword_entities():
    data = u"""
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "show me flights to New York City",
        "intent": "unk",
        "entities": [
          {
            "entity": "destination",
            "start": 19,
            "end": 32,
            "value": "New York City"
          }
        ]
      }
    ]
  }
}"""
    filename = 'tmp_training_data.json'
    # open in binary mode since the payload is utf-8 encoded bytes
    with open(filename, 'wb') as f:
        f.write(data.encode("utf-8"))
    td = TrainingData(filename, 'mitie', 'en')
    assert len(td.entity_examples) == 1
    example = td.entity_examples[0]
    entities = example["entities"]
    assert len(entities) == 1
    start, end = MITIETrainer.find_entity(entities[0], example["text"])
    assert start == 4
    assert end == 7
def test_repeated_entities():
    data = u"""
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "book a table today from 3 to 6 for 3 people",
        "intent": "unk",
        "entities": [
          {
            "entity": "description",
            "start": 35,
            "end": 36,
            "value": 3
          }
        ]
      }
    ]
  }
}"""
    filename = 'tmp_training_data.json'
    # open in binary mode since the payload is utf-8 encoded bytes
    with open(filename, 'wb') as f:
        f.write(data.encode("utf-8"))
    td = TrainingData(filename, 'mitie', 'en')
    assert len(td.entity_examples) == 1
    example = td.entity_examples[0]
    entities = example["entities"]
    assert len(entities) == 1
    start, end = MITIETrainer.find_entity(entities[0], example["text"])
    assert start == 9
    assert end == 10
def load_rasa_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""
    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())
    validate_rasa_nlu_data(data)

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    # build entity_synonyms dictionary
    entity_synonyms = {}
    for s in synonyms:
        if "value" in s and "synonyms" in s:
            for synonym in s["synonyms"]:
                entity_synonyms[synonym] = s["value"]

    if intent or entity:
        logger.warning(
            "DEPRECATION warning: Data file contains 'intent_examples' "
            "or 'entity_examples' which will be removed in the future. "
            "Consider putting all your examples into the "
            "'common_examples' section.")

    all_examples = common + intent + entity
    return TrainingData(all_examples, entity_synonyms)
def load_wit_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the WIT.ai data format."""
    training_examples = []

    data = _read_json_from_file(filename)
    for s in data["data"]:
        entities = s.get("entities")
        if entities is None:
            continue
        text = s.get("text")
        intents = [e["value"] for e in entities if e["entity"] == 'intent']
        intent = intents[0].strip("\"") if intents else None

        entities = [e
                    for e in entities
                    if ("start" in e and "end" in e and
                        e["entity"] != 'intent')]
        for e in entities:
            # for some reason wit adds additional quotes around entity values
            e["value"] = e["value"].strip("\"")

        example_data = {}
        if intent:
            example_data["intent"] = intent
        if entities is not None:
            example_data["entities"] = entities
        training_examples.append(Message(text, example_data))
    return TrainingData(training_examples)
def test_multiword_entities():
    data = u"""
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "show me flights to New York City",
        "intent": "unk",
        "entities": [
          {
            "entity": "destination",
            "start": 19,
            "end": 32,
            "value": "New York City"
          }
        ]
      }
    ]
  }
}"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = TrainingData(f.name, 'mitie', 'en')
        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example["entities"]
        assert len(entities) == 1
        start, end = mitie_trainer_utils.find_entity(entities[0],
                                                     example["text"])
        assert start == 4
        assert end == 7
def do_GET(self):
    """Serve an HTML rendering of the training data file given on the CLI."""
    self._set_headers()
    data_file = sys.argv[1]
    training_data = TrainingData(data_file, 'mitie', 'en')
    data = create_html(training_data)
    self.wfile.write(data.encode('utf-8'))
    return
def read_from_json(self, js, **kwargs):
    """Loads training data stored in the rasa NLU data format."""
    validate_rasa_nlu_data(js)

    data = js['rasa_nlu_data']
    common_examples = data.get("common_examples", [])
    intent_examples = data.get("intent_examples", [])
    entity_examples = data.get("entity_examples", [])
    entity_synonyms = data.get("entity_synonyms", [])
    regex_features = data.get("regex_features", [])
    lookup_tables = data.get("lookup_tables", [])

    entity_synonyms = transform_entity_synonyms(entity_synonyms)

    if intent_examples or entity_examples:
        logger.warning("DEPRECATION warning: your rasa data "
                       "contains 'intent_examples' "
                       "or 'entity_examples' which will be "
                       "removed in the future. Consider "
                       "putting all your examples "
                       "into the 'common_examples' section.")

    all_examples = common_examples + intent_examples + entity_examples
    training_examples = []
    for ex in all_examples:
        msg = Message.build(ex['text'], ex.get("intent"),
                            ex.get("entities"))
        training_examples.append(msg)

    return TrainingData(training_examples, entity_synonyms,
                        regex_features, lookup_tables)
def load_rasa_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""
    data = _read_json_from_file(filename)
    validate_rasa_nlu_data(data)

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    entity_synonyms = get_entity_synonyms_dict(synonyms)

    if intent or entity:
        logger.warning("DEPRECATION warning: Data file contains "
                       "'intent_examples' or 'entity_examples' which will be "
                       "removed in the future. Consider putting all your "
                       "examples into the 'common_examples' section.")

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        example_data = e.copy()
        if "text" in example_data:
            del example_data["text"]
        training_examples.append(Message(e["text"], example_data))

    return TrainingData(training_examples, entity_synonyms, regex_features)
def load_wit_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the WIT.ai data format."""
    training_examples = []

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())
    for s in data["data"]:
        entities = s.get("entities")
        if entities is None:
            continue
        text = s.get("text")
        intents = [e["value"] for e in entities if e["entity"] == 'intent']
        intent = intents[0].strip("\"") if intents else None

        entities = [e
                    for e in entities
                    if ("start" in e and "end" in e and
                        e["entity"] != 'intent')]
        for e in entities:
            # for some reason wit adds additional quotes around entity values
            e["value"] = e["value"].strip("\"")

        example = {"text": text}
        if intent:
            example["intent"] = intent
        if entities:
            example["entities"] = entities
        training_examples.append(example)
    return TrainingData(training_examples)
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # use an empty string instead of a real text string to make sure the
    # count vector can only be built from the `tokens` feature; relying on
    # `message.text` would not produce the correct result
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
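# An illustrative parametrisation for the test above (an assumption, not
# taken from the source): with sklearn's CountVectorizer under the hood the
# learned vocabulary is sorted alphabetically, so the tokens
# ["hello", "hello", "world"] would yield the vocabulary ["hello", "world"]
# and the expected count vector [2, 1].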
def load_markdown_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in markdown data format."""
    from rasa_nlu.utils.md_to_json import MarkdownToJson
    data = MarkdownToJson(filename)
    return TrainingData(data.common_examples,
                        get_entity_synonyms_dict(data.entity_synonyms))
def test_unintentional_synonyms_capitalized(component_builder):
    _config = utilities.base_test_conf("all_components")
    ner_syn = component_builder.create_component("ner_synonyms", _config)

    examples = [
        Message("Any Mexican restaurant will do", {
            "intent": "restaurant_search",
            "entities": [{"start": 4, "end": 11,
                          "value": "Mexican", "entity": "cuisine"}]
        }),
        Message("I want Tacos!", {
            "intent": "restaurant_search",
            "entities": [{"start": 7, "end": 12,
                          "value": "Mexican", "entity": "cuisine"}]
        })
    ]

    ner_syn.train(TrainingData(training_examples=examples), _config)

    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican"
def load_rasa_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""
    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())
    validate_rasa_nlu_data(data)

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    entity_synonyms = get_entity_synonyms_dict(synonyms)

    if intent or entity:
        logger.warning(
            "DEPRECATION warning: Data file contains 'intent_examples' "
            "or 'entity_examples' which will be removed in the future. "
            "Consider putting all your examples into the "
            "'common_examples' section.")

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        example_data = {}
        if e.get("intent"):
            example_data["intent"] = e["intent"]
        if e.get("entities") is not None:
            example_data["entities"] = e["entities"]
        training_examples.append(Message(e["text"], example_data))

    return TrainingData(training_examples, entity_synonyms, regex_features)
def read_from_json(self, js: Dict[Text, Any], **kwargs: Any):
    """Loads training data stored in the WIT.ai data format."""
    from rasa_nlu.training_data import Message, TrainingData

    training_examples = []

    for s in js["data"]:
        entities = s.get("entities")
        if entities is None:
            continue
        text = s.get("text")
        intents = [e["value"] for e in entities if e["entity"] == 'intent']
        intent = intents[0].strip("\"") if intents else None

        entities = [e
                    for e in entities
                    if ("start" in e and "end" in e and
                        e["entity"] != 'intent')]
        for e in entities:
            # for some reason wit adds additional quotes around entities
            e["value"] = e["value"].strip("\"")

        data = {}
        if intent:
            data["intent"] = intent
        if entities is not None:
            data["entities"] = entities
        training_examples.append(Message(text, data))
    return TrainingData(training_examples)
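# Illustrative WIT.ai export entry of the shape read_from_json above consumes
# (a sketch inferred from the parsing logic; the values are invented): the
# intent arrives as a pseudo-entity without start/end, real entities carry
# character offsets, and wit wraps values in extra quotes that get stripped.
WIT_EXAMPLE_JS = {
    "data": [{
        "text": "show me chinese restaurants",
        "entities": [
            {"entity": "intent", "value": "\"restaurant_search\""},
            {"entity": "cuisine", "value": "\"chinese\"",
             "start": 8, "end": 15},
        ],
    }]
}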
def load_api_data(files):
    # type: ([str]) -> TrainingData
    """Loads training data stored in the API.ai data format."""
    intent_examples = []
    entity_examples = []
    common_examples = []
    entity_synonyms = {}
    for filename in files:
        with io.open(filename, encoding="utf-8-sig") as f:
            data = json.loads(f.read())
        # get only intents, skip the rest. The property name is the target class
        if "userSays" in data:
            intent = data.get("name")
            for s in data["userSays"]:
                text = "".join([chunk["text"] for chunk in s.get("data")])
                # add entities to each token, if available
                entities = []
                for e in [chunk
                          for chunk in s.get("data")
                          if "alias" in chunk or "meta" in chunk]:
                    start = text.find(e["text"])
                    end = start + len(e["text"])
                    val = text[start:end]
                    entities.append({
                        "entity": e["alias"] if "alias" in e else e["meta"],
                        "value": val,
                        "start": start,
                        "end": end
                    })

                if intent and entities:
                    common_examples.append({
                        "text": text,
                        "intent": intent,
                        "entities": entities
                    })
                elif intent:
                    intent_examples.append({"text": text, "intent": intent})
                elif entities:
                    entity_examples.append({
                        "text": text,
                        "intent": intent,
                        "entities": entities
                    })

        # create synonyms dictionary
        if "name" in data and "entries" in data:
            for entry in data["entries"]:
                if "value" in entry and "synonyms" in entry:
                    for synonym in entry["synonyms"]:
                        entity_synonyms[synonym] = entry["value"]
    return TrainingData(intent_examples, entity_examples, common_examples,
                        entity_synonyms)
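# Illustrative API.ai intent file of the shape load_api_data above walks
# (a sketch inferred from the parsing logic; field values are invented):
# each "userSays" entry is a list of text chunks, and chunks carrying an
# "alias" or "meta" key mark entity spans within the joined text.
API_AI_INTENT_EXAMPLE = {
    "name": "restaurant_search",
    "userSays": [{
        "data": [
            {"text": "show me "},
            {"text": "chinese", "alias": "cuisine", "meta": "@cuisine"},
            {"text": " restaurants"},
        ]
    }]
}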
def do_train(config):
    trainer = create_trainer(config)
    persistor = create_persistor(config)
    training_data = TrainingData(config.data, config.backend, config.language)
    trainer.train(training_data)
    trainer.persist(config.path, persistor)
def _read_entities(entity_js, examples_js):
    """Reads an entity definition with its synonyms and lookup tables."""
    from rasa_nlu.training_data import TrainingData

    entity_synonyms = transform_entity_synonyms(examples_js)

    name = entity_js.get("name")
    lookup_tables = DialogflowReader._extract_lookup_tables(name,
                                                            examples_js)
    return TrainingData([], entity_synonyms, [], lookup_tables)
def load_data(resource_name, language='en'):
    # type: (Text, Optional[Text]) -> TrainingData
    """Loads training data from disk and merges it
    if multiple files are found."""

    files = utils.list_files(resource_name)
    data_sets = [_load(f, language) for f in files]
    data_sets = [ds for ds in data_sets if ds]
    if len(data_sets) == 0:
        training_data = TrainingData()
    elif len(data_sets) == 1:
        training_data = data_sets[0]
    else:
        training_data = data_sets[0].merge(*data_sets[1:])

    training_data.validate()
    return training_data
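# A minimal usage sketch for load_data above, assuming a pre-1.0 rasa_nlu
# install where load_data lives in rasa_nlu.training_data; the path below is
# illustrative and must point at an existing training file or directory,
# whose contents load_data loads and merges into a single TrainingData.
from rasa_nlu.training_data import load_data

td = load_data("data/examples/rasa/demo-rasa.json")
print("loaded {} training examples".format(len(td.training_examples)))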
def _read_intent(self, intent_js, examples_js):
    """Reads the intent and its examples from the respective JSON dicts."""
    intent = intent_js.get("name")
    training_examples = []
    for ex in examples_js:
        text, entities = self._join_text_chunks(ex['data'])
        training_examples.append(Message.build(text, intent, entities))

    return TrainingData(training_examples)
def test_train_with_empty_data(component_builder):
    _config = utilities.base_test_conf("all_components")
    trainer = Trainer(_config, component_builder)
    trainer.train(TrainingData())
    persistor = create_persistor(_config)
    persisted_path = trainer.persist(_config['path'],
                                     persistor,
                                     project_name=_config['name'])
    loaded = utilities.load_interpreter_for_model(_config, persisted_path,
                                                  component_builder)
    assert loaded.pipeline
    assert loaded.parse("hello") is not None
    assert loaded.parse("Hello today is Monday, again!") is not None
def test_train_with_empty_data(language, pipeline, component_builder, tmpdir):
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
    trainer = Trainer(_config, component_builder)
    trainer.train(TrainingData())
    persistor = create_persistor(_config)
    persisted_path = trainer.persist(tmpdir.strpath,
                                     persistor,
                                     project_name="my_project")
    loaded = Interpreter.load(persisted_path, component_builder)
    assert loaded.pipeline
    assert loaded.parse("hello") is not None
    assert loaded.parse("Hello today is Monday, again!") is not None
def do_train(config):
    """Loads the trainer and the data and runs the training
    of the specified model."""
    trainer = create_trainer(config)
    persistor = create_persistor(config)
    training_data = TrainingData(config.data, config.backend, config.language)
    trainer.train(training_data)
    trainer.persist(config.path, persistor)
    return trainer