def test_train_tokenizer(text, expected_tokens, expected_indices):
    """Training the whitespace tokenizer should tokenize text and response
    into the expected tokens/spans and keep the intent as one token."""
    tokenizer = WhitespaceTokenizer()

    example = Message(text)
    example.set(RESPONSE_ATTRIBUTE, text)
    example.set(INTENT_ATTRIBUTE, text)

    data = TrainingData()
    data.training_examples = [example]

    tokenizer.train(data)

    expected_starts = [span[0] for span in expected_indices]
    expected_ends = [span[1] for span in expected_indices]

    for attribute in (RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE):
        tokens = data.training_examples[0].get(TOKENS_NAMES[attribute])
        assert [token.text for token in tokens] == expected_tokens
        assert [token.start for token in tokens] == expected_starts
        assert [token.end for token in tokens] == expected_ends

    # the intent attribute is not split on whitespace: it stays a single token
    tokens = data.training_examples[0].get(TOKENS_NAMES[INTENT_ATTRIBUTE])
    assert [token.text for token in tokens] == [text]
def test_count_vectors_featurizer_train():
    """Training the count-vectors featurizer sets sparse features for the
    text, response and intent attributes of the message."""
    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    msg = Message(sentence)
    msg.set(RESPONSE, sentence)
    msg.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([msg]))
    featurizer.train(TrainingData([msg]), RasaNLUModelConfig())

    expected_first_row = np.array([0, 1, 0, 0, 0])
    expected_cls_row = np.array([1, 1, 1, 1, 1])

    # text and response carry the same sentence, hence identical features
    for attribute in (TEXT, RESPONSE):
        vecs = msg.get(SPARSE_FEATURE_NAMES[attribute])
        assert vecs.shape == (6, 5)
        dense = vecs.toarray()
        assert np.all(dense[0] == expected_first_row)
        assert np.all(dense[-1] == expected_cls_row)

    vecs = msg.get(SPARSE_FEATURE_NAMES[INTENT])
    assert vecs.shape == (1, 1)
    assert np.all(vecs.toarray()[0] == np.array([1]))
def test_process(
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
):
    """A RegexEntityExtractor trained on lookup tables should extract the
    expected entities from `text`."""
    message = Message(text)

    data = TrainingData()
    data.lookup_tables = lookup
    data.training_examples = [
        Message(
            "Hi Max!",
            data={"entities": [{"entity": "person", "value": "Max"}]},
        ),
        Message(
            "I live in Berlin",
            data={"entities": [{"entity": "city", "value": "Berlin"}]},
        ),
    ]

    extractor = RegexEntityExtractor()
    extractor.train(data)
    extractor.process(message)

    assert message.get(ENTITIES) == expected_entities
def reads(self, string: Text, **kwargs: Any) -> "TrainingData":
    """Reads TrainingData in YAML format from a string.

    Args:
        string: String with YAML training data.
        **kwargs: Keyword arguments.

    Returns:
        New `TrainingData` object with parsed training data.
    """
    from rasa.nlu.training_data import TrainingData
    from rasa.validator import Validator

    self.validate(string)

    yaml_content = io_utils.read_yaml(string)

    version_ok = Validator.validate_training_data_format_version(
        yaml_content, self.filename
    )
    if not version_ok:
        return TrainingData()

    for key, value in yaml_content.items():  # pytype: disable=attribute-error
        if key == KEY_NLU:
            self._parse_nlu(value)
        elif key == KEY_RESPONSES:
            self.responses = value

    return TrainingData(
        self.training_examples,
        self.entity_synonyms,
        self.regex_features,
        self.lookup_tables,
        self.responses,
    )
def test_count_vectors_featurizer_train():
    """Training the count-vectors featurizer yields sequence and sentence
    sparse features for text/response, and only a sequence feature for the
    intent."""
    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    msg = Message(sentence)
    msg.set(RESPONSE, sentence)
    msg.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([msg]))
    featurizer.train(TrainingData([msg]), RasaNLUModelConfig())

    expected_seq = np.array([0, 1, 0, 0, 0])
    expected_sen = np.array([1, 1, 1, 1, 1])

    # text and response carry the same sentence, hence identical features
    for attribute in (TEXT, RESPONSE):
        seq_vec, sen_vec = msg.get_sparse_features(attribute, [])
        assert seq_vec.shape == (5, 5)
        assert sen_vec.shape == (1, 5)
        assert np.all(seq_vec.toarray()[0] == expected_seq)
        assert np.all(sen_vec.toarray()[-1] == expected_sen)

    seq_vec, sen_vec = msg.get_sparse_features(INTENT, [])
    assert sen_vec is None
    assert seq_vec.shape == (1, 1)
    assert np.all(seq_vec.toarray()[0] == np.array([1]))
def training_data_from_paths(paths: Iterable[Text], language: Text) -> TrainingData:
    """Load NLU training data from every path and merge into one object.

    Args:
        paths: NLU training data file paths to load.
        language: Language the training data is in.

    Returns:
        Merged `TrainingData` with response phrases filled in.
    """
    from rasa.nlu.training_data import loading

    loaded = [loading.load_data(path, language) for path in paths]
    merged = TrainingData().merge(*loaded)
    merged.fill_response_phrases()
    return merged
def test_regex_featurizer_train():
    """Training the regex featurizer with three patterns sets sparse
    sequence/sentence features on text and response, none on the intent."""
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "hey how are you today 19.12.2019 ?"
    msg = Message(sentence)
    msg.set(RESPONSE, sentence)
    msg.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([msg]))
    featurizer.train(
        TrainingData([msg], regex_features=patterns), RasaNLUModelConfig()
    )

    expected_seq = np.array([0, 1, 0])
    expected_sen = np.array([1, 1, 1])

    for attribute in (TEXT, RESPONSE):
        seq_vecs, sen_vec = msg.get_sparse_features(attribute, [])
        assert seq_vecs.shape == (6, 3)
        assert sen_vec.shape == (1, 3)
        assert np.all(seq_vecs.toarray()[0] == expected_seq)
        assert np.all(sen_vec.toarray()[-1] == expected_sen)

    # the intent attribute receives no regex features
    seq_vecs, sen_vec = msg.get_sparse_features(INTENT, [])
    assert seq_vecs is None
    assert sen_vec is None
def test_do_not_overwrite_any_entities():
    """Entities already present on a message are kept; the extractor only
    appends the newly found ones."""
    message = Message("Max lives in Berlin.")
    message.set(
        ENTITIES, [{"entity": "person", "value": "Max", "start": 0, "end": 3}]
    )

    data = TrainingData()
    data.training_examples = [
        Message(
            "Hi Max!",
            data={"entities": [{"entity": "person", "value": "Max"}]},
        ),
        Message(
            "I live in Berlin",
            data={"entities": [{"entity": "city", "value": "Berlin"}]},
        ),
    ]
    data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    extractor = RegexEntityExtractor()
    extractor.train(data)
    extractor.process(message)

    assert message.get(ENTITIES) == [
        {"entity": "person", "value": "Max", "start": 0, "end": 3},
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        },
    ]
def test_extract_patterns(
    lookup_tables: Dict[Text, List[Text]],
    regex_features: Dict[Text, Text],
    expected_patterns: Dict[Text, Text],
):
    """Patterns are extracted from lookup tables and/or regex features."""
    data = TrainingData()
    if lookup_tables:
        data.lookup_tables = [lookup_tables]
    if regex_features:
        data.regex_features = [regex_features]

    assert pattern_utils.extract_patterns(data) == expected_patterns
def test_regex_featurizer_train():
    """Training the regex featurizer sets sparse features on text and
    response (one row per token plus CLS), and none on the intent."""
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "hey how are you today 19.12.2019 ?"
    msg = Message(sentence)
    msg.set(RESPONSE_ATTRIBUTE, sentence)
    msg.set(INTENT_ATTRIBUTE, "intent")

    WhitespaceTokenizer().train(TrainingData([msg]))
    featurizer.train(
        TrainingData([msg], regex_features=patterns), RasaNLUModelConfig()
    )

    expected_first = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    for attribute in (TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE):
        vecs = msg.get(SPARSE_FEATURE_NAMES[attribute])
        assert vecs.shape == (7, 3)
        dense = vecs.toarray()
        assert np.all(dense[0] == expected_first)
        assert np.all(dense[-1] == expected_cls)

    assert msg.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None
def test_count_vector_featurizer_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    """Intent/response sparse features are produced only when expected."""
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    ftr = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "return_sequence": True}
    )

    msg = Message(sentence)
    # an intent is required for a valid training example
    msg.set("intent", intent)
    msg.set("response", response)

    ftr.train(TrainingData([msg]))

    if intent_features:
        assert msg.get("intent_sparse_features").toarray()[0] == intent_features
    else:
        assert msg.get("intent_sparse_features") is None

    if response_features:
        assert (
            msg.get("response_sparse_features").toarray()[0] == response_features
        )
    else:
        assert msg.get("response_sparse_features") is None
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Count vectors must come from the `tokens` feature, not the message
    text (which is deliberately empty here)."""
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    token_objects = [Token(text, 0) for text in tokens]

    train_message = Message("")
    train_message.set("tokens", token_objects)
    # an intent is required for a valid training example
    train_message.set("intent", "bla")

    ftr.train(TrainingData([train_message]))

    test_message = Message("")
    test_message.set("tokens", token_objects)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
def read(self, fn: Text, **kwargs: Any) -> "TrainingData":
    """Loads training data stored in the Dialogflow data format.

    Args:
        fn: Path of the Dialogflow file to read.
        **kwargs: Must contain `language` and `fformat` keys.

    Returns:
        Parsed `TrainingData`; empty when no examples are found.

    Raises:
        ValueError: If `fformat` is not a known Dialogflow format.
    """
    from rasa.nlu.training_data import TrainingData

    language = kwargs["language"]
    fformat = kwargs["fformat"]

    if fformat not in {DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES}:
        raise ValueError(
            "fformat must be either {}, or {}"
            "".format(DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES)
        )

    root_js = rasa.utils.io.read_json_file(fn)
    examples_js = self._read_examples_js(fn, language, fformat)

    if not examples_js:
        raise_warning(
            f"No training examples found for dialogflow file {fn}!",
            docs=DOCS_URL_MIGRATE_GOOGLE,
        )
        return TrainingData()

    if fformat == DIALOGFLOW_INTENT:
        return self._read_intent(root_js, examples_js)
    # remaining path: DIALOGFLOW_ENTITIES
    return self._read_entities(root_js, examples_js)
def read_from_json(self, js, **kwargs):
    """Loads training data stored in the rasa NLU data format.

    Accepts the deprecated `intent_examples`/`entity_examples` sections and
    folds them into the examples list with a deprecation warning.
    """
    from rasa.nlu.training_data import Message, TrainingData

    validate_rasa_nlu_data(js)

    data = js["rasa_nlu_data"]
    common_examples = data.get("common_examples", [])
    intent_examples = data.get("intent_examples", [])
    entity_examples = data.get("entity_examples", [])
    entity_synonyms = data.get("entity_synonyms", [])
    regex_features = data.get("regex_features", [])
    lookup_tables = data.get("lookup_tables", [])

    entity_synonyms = transform_entity_synonyms(entity_synonyms)

    if intent_examples or entity_examples:
        logger.warning("DEPRECATION warning: your rasa data "
                       "contains 'intent_examples' "
                       "or 'entity_examples' which will be "
                       "removed in the future. Consider "
                       "putting all your examples "
                       "into the 'common_examples' section.")

    training_examples = [
        Message.build(ex["text"], ex.get("intent"), ex.get("entities"))
        for ex in common_examples + intent_examples + entity_examples
    ]

    return TrainingData(
        training_examples, entity_synonyms, regex_features, lookup_tables
    )
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Features must be derived from the precomputed tokens; the message
    text is empty on purpose so it cannot contribute."""
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    token_objects = [Token(text, 0) for text in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], token_objects)

    ftr.train(TrainingData([train_message]))

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], token_objects)
    ftr.process(test_message)

    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0]
    assert np.all(actual == expected)
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Features must be derived from the precomputed tokens; the message
    text is empty on purpose so it cannot contribute."""
    ftr = CountVectorsFeaturizer()

    token_objects = [Token(text, 0) for text in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT], token_objects)

    ftr.train(TrainingData([train_message]))

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT], token_objects)
    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer_shared_vocab(
    sentence, intent, response, text_features, intent_features, response_features
):
    """With a shared vocabulary, text/intent/response features must line up
    with the expected vectors."""
    ftr = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "use_shared_vocab": True}
    )
    tokenizer = WhitespaceTokenizer()

    msg = Message(sentence)
    # an intent is required for a valid training example
    msg.set(INTENT, intent)
    msg.set(RESPONSE, response)

    data = TrainingData([msg])
    tokenizer.train(data)
    ftr.train(data)

    checks = (
        (TEXT, text_features),
        (INTENT, intent_features),
        (RESPONSE, response_features),
    )
    for attribute, expected in checks:
        actual = msg.get(SPARSE_FEATURE_NAMES[attribute]).toarray()[0]
        assert np.all(actual == expected)
def reads(self, s: Text, **kwargs: Any) -> "TrainingData":
    """Read markdown string and create TrainingData object"""
    from rasa.nlu.training_data import TrainingData

    s = self._strip_comments(s)
    for raw_line in s.splitlines():
        line = decode_string(raw_line.strip())
        header = self._find_section_header(line)
        if header:
            self._set_current_section(header[0], header[1])
        else:
            self._parse_item(line)
            self._load_files(line)

    if self._deprecated_synonym_format_was_used:
        # old `[text](entity:value)` synonym markup was seen while parsing
        raise_warning(
            "You are using the deprecated training data format to declare synonyms."
            " Please use the following format: \n"
            '[<entity-text>]{"entity": "<entity-type>", "value": '
            '"<entity-synonym>"}.'
            "\nYou can use the following command to update your training data file:"
            "\nsed -i -E 's/\\[([^)]+)\\]\\(([^)]+):([^)]+)\\)/[\\1]{"
            '"entity": "\\2", "value": "\\3"}/g\' nlu.md',
            category=FutureWarning,
            docs=DOCS_URL_TRAINING_DATA_NLU,
        )

    return TrainingData(
        self.training_examples,
        self.entity_synonyms,
        self.regex_features,
        self.lookup_tables,
    )
def test_whitespace_training(supervised_embeddings_config):
    """Training the whitespace tokenizer tokenizes every example's text,
    stripping punctuation such as the trailing '!'."""
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
    ]

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(
        TrainingData(training_examples=examples), supervised_embeddings_config
    )

    first_tokens = examples[0].data.get(TOKENS_NAMES[TEXT])
    assert [t.text for t in first_tokens[:5]] == [
        "Any",
        "Mexican",
        "restaurant",
        "will",
        "do",
    ]

    second_tokens = examples[1].data.get(TOKENS_NAMES[TEXT])
    assert [t.text for t in second_tokens[:3]] == ["I", "want", "Tacos"]
def read_from_json(self, js: Dict[Text, Any], **kwargs: Any):
    """Loads training data stored in the WIT.ai data format.

    WIT.ai encodes the intent as a pseudo-entity named "intent"; it is
    separated out here, and quote characters WIT adds around values are
    stripped.
    """
    from rasa.nlu.training_data import Message, TrainingData

    training_examples = []

    for sample in js["data"]:
        entities = sample.get("entities")
        if entities is None:
            continue

        text = sample.get("text")

        # the intent is stored as an entity with entity type "intent"
        intents = [e["value"] for e in entities if e["entity"] == 'intent']
        intent = intents[0].strip("\"") if intents else None

        entities = [
            e
            for e in entities
            if ("start" in e and "end" in e and e["entity"] != 'intent')
        ]
        for entity in entities:
            # for some reason wit adds additional quotes around entities
            entity["value"] = entity["value"].strip("\"")

        data = {}
        if intent:
            data["intent"] = intent
        if entities is not None:
            data["entities"] = entities
        training_examples.append(Message(text, data))

    return TrainingData(training_examples)
async def test_get_nlu_data(Faker: asynctest.MagicMock, load_data: asynctest.MagicMock) -> None:
    """The PlaceholderImporter should replace `@name` placeholders in the
    loaded NLU examples with generated fake names."""
    fake = Faker()
    fake.name.return_value = "Nikola Tesla"

    load_data.return_value = TrainingData(
        training_examples=[
            Message.build("hello", "intent_test"),
            Message.build("hello @name", "intent_test"),
            Message.build("hello"),
        ]
    )

    importer = PlaceholderImporter()
    importer.config = {"importers": [{"name": "rasam.PlaceholderImporter"}]}
    importer._nlu_files = ["test"]

    result = await importer.get_nlu_data()

    fake.seed_instance.assert_called_once_with(importer.DEFAULT_FAKE_DATA_COUNT)
    load_data.assert_called_once_with("test", "en")

    expected_messages = [
        Message.build("hello", "intent_test"),
        Message.build("hello Nikola Tesla", "intent_test"),
        Message.build("hello"),
    ]
    for actual, expected in zip(result.training_examples, expected_messages):
        assert actual.get("intent") == expected.get("intent")
        assert actual.get("text") == expected.get("text")
def test_count_vector_featurizer_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    """Intent/response sparse features exist exactly when expected."""
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tokenizer = WhitespaceTokenizer()

    msg = Message(sentence)
    # an intent is required for a valid training example
    msg.set(INTENT, intent)
    msg.set(RESPONSE, response)

    data = TrainingData([msg])
    tokenizer.train(data)
    ftr.train(data)

    checks = ((INTENT, intent_features), (RESPONSE, response_features))
    for attribute, expected in checks:
        vecs = msg.get(SPARSE_FEATURE_NAMES[attribute])
        if expected:
            assert vecs.toarray()[0] == expected
        else:
            assert vecs is None
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    """Response features require that some example carries a response so a
    response vocabulary exists."""
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tokenizer = WhitespaceTokenizer()

    train_message = Message(sentence)
    # an intent is required for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # a second example with a response guarantees a response vocabulary
    second_message = Message("hello")
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])
    tokenizer.train(data)
    ftr.train(data)

    checks = ((INTENT, intent_features), (RESPONSE, response_features))
    for attribute, expected in checks:
        vecs = train_message.get(SPARSE_FEATURE_NAMES[attribute])
        if expected:
            assert vecs.toarray()[0] == expected
        else:
            assert vecs is None
def test_unintentional_synonyms_capitalized(component_builder):
    """Entity values that differ from the literal text become synonyms
    ("tacos" -> "Mexican"); case-only differences must not."""
    _config = utilities.base_test_conf("pretrained_embeddings_spacy")
    ner_syn = component_builder.create_component(_config.for_component(5), _config)

    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
    ]

    ner_syn.train(TrainingData(training_examples=examples), _config)

    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican"
def test_convert_featurizer_train():
    """ConveRT featurizer sets dense features for text and response but not
    for the intent."""
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    msg = Message(sentence)
    msg.set(RESPONSE, sentence)

    tokens = ConveRTTokenizer().tokenize(msg, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    msg.set(TOKENS_NAMES[TEXT], tokens)
    msg.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(TrainingData([msg]), RasaNLUModelConfig())

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456]
    )
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    # text and response share the same tokens, hence identical features
    for attribute in (TEXT, RESPONSE):
        vecs = msg.get(DENSE_FEATURE_NAMES[attribute])
        assert len(vecs) == len(tokens)
        assert np.allclose(vecs[0][:5], expected, atol=1e-5)
        assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    assert msg.get(DENSE_FEATURE_NAMES[INTENT]) is None
def test_build_tag_id_dict():
    """The tag-id dict maps "O" to 0 and assigns consecutive ids to every
    BILOU prefix of each entity type seen in the training data."""
    first = Message("Germany is part of the European Union")
    first.set(
        BILOU_ENTITIES,
        ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"],
    )
    second = Message("Berlin is the capital of Germany")
    second.set(BILOU_ENTITIES, ["U-location", "O", "O", "O", "O", "U-location"])

    tag_id_dict = bilou_utils.build_tag_id_dict(TrainingData([first, second]))

    # build the expectation programmatically: B/I/U/L per entity type
    expected = {"O": 0}
    next_id = 1
    for entity in ("location", "organisation"):
        for prefix in ("B", "I", "U", "L"):
            expected[f"{prefix}-{entity}"] = next_id
            next_id += 1

    assert tag_id_dict == expected
def test_count_vector_featurizer_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    """Sequence/sentence sparse features for intent and response attributes
    exist exactly when expected; intents never get a sentence feature."""
    ftr = CountVectorsFeaturizer()
    tokenizer = WhitespaceTokenizer()

    msg = Message(sentence)
    # an intent is required for a valid training example
    msg.set(INTENT, intent)
    msg.set(RESPONSE, response)

    data = TrainingData([msg])
    tokenizer.train(data)
    ftr.train(data)

    intent_seq, intent_sen = msg.get_sparse_features(INTENT, [])
    response_seq, response_sen = msg.get_sparse_features(RESPONSE, [])

    if intent_features:
        assert intent_seq.toarray()[0] == intent_features
        # intents never get a sentence-level feature
        assert intent_sen is None
    else:
        assert intent_seq is None
        assert intent_sen is None

    if response_features:
        assert response_seq.toarray()[0] == response_features
        assert response_sen is not None
    else:
        assert response_seq is None
        assert response_sen is None
def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
    """Prepares data for training.

    Performs sanity checks on training data, extracts encodings for labels.

    Args:
        training_data: NLU training data.

    Returns:
        `RasaModelData` ready for training; empty when no labels exist.
    """
    data = (
        training_data.filter_by_intent(self.retrieval_intent)
        if self.retrieval_intent
        else training_data
    )

    label_mapping = self._label_id_index_mapping(data, attribute=RESPONSE)
    if not label_mapping:
        # no labels are present to train
        return RasaModelData()

    self.index_label_id_mapping = self._invert_mapping(label_mapping)
    self._label_data = self._create_label_data(
        data, label_mapping, attribute=RESPONSE
    )

    model_data = self._create_model_data(
        data.intent_examples, label_mapping, label_attribute=RESPONSE
    )
    self._check_input_dimension_consistency(model_data)
    return model_data
def test_spacy_featurizer_train(spacy_nlp):
    """Spacy featurizer sets dense features (one row per token plus CLS) on
    text and response, none on the intent."""
    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    msg = Message(sentence)
    msg.set(RESPONSE_ATTRIBUTE, sentence)
    msg.set(INTENT_ATTRIBUTE, "intent")
    msg.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
    msg.set(SPACY_DOCS[RESPONSE_ATTRIBUTE], spacy_nlp(sentence))

    featurizer.train(TrainingData([msg]), RasaNLUModelConfig())

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array(
        [-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]
    )

    for attribute in (TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE):
        vecs = msg.get(DENSE_FEATURE_NAMES[attribute])
        assert len(vecs) == 6
        assert np.allclose(vecs[0][:5], expected, atol=1e-5)
        assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    assert msg.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None
async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
    """Concurrently collect NLU data from all importers and merge it.

    Args:
        language: Language the NLU data should be loaded for.

    Returns:
        A single `TrainingData` merged from every importer's result.
    """
    pending = [importer.get_nlu_data(language) for importer in self._importers]
    gathered = await asyncio.gather(*pending)

    merged = TrainingData()
    for data in gathered:
        merged = merged.merge(data)
    return merged