def load_data(resource_name: Text, language: Optional[Text] = "en") -> "TrainingData":
    """Read training data from a file, or from every file inside a directory.

    Args:
        resource_name: Path to a single file or to a directory.
        language: Language handed through to the loader of each file.

    Returns:
        An empty `TrainingData` when nothing could be loaded, the single data
        set when exactly one was loaded, otherwise the first data set merged
        with all remaining ones.

    Raises:
        ValueError: If `resource_name` does not exist on disk.
    """
    if not os.path.exists(resource_name):
        raise ValueError(f"File '{resource_name}' does not exist.")

    paths = (
        [resource_name]
        if os.path.isfile(resource_name)
        else rasa.shared.utils.io.list_files(resource_name)
    )

    candidates = [_load(path, language) for path in paths]
    # Discard falsy results returned by the loader.
    loaded = list(filter(None, candidates))

    if not loaded:
        return TrainingData()

    first, *others = loaded
    if not others:
        return first
    return first.merge(*others)
def test_features_for_messages_with_missing_part_of_speech_tags(
    create_lexical_syntactic_featurizer: Callable[
        [Dict[Text, Any]], LexicalSyntacticFeaturizer
    ],
):
    """Checks the feature width for tokens without part-of-speech tags."""
    # The tokens are built from plain regex matches, so NO part of speech
    # information is attached to them.
    text = "hello goodbye hello"
    tokens = [
        Token(text=found[0], start=found.start())
        for found in re.finditer(r"\w+", text)
    ]
    message = Message(data={TOKENS_NAMES[TEXT]: tokens})

    # train and process
    config = {"alias": "lsf", "features": [["BOS", "pos"]]}
    featurizer = create_lexical_syntactic_featurizer(config)
    featurizer.train(TrainingData([message]))
    featurizer.process([message])

    first_feature = message.features[0]
    # BOS = True/False, pos = None
    number_of_columns = first_feature.features.shape[1]
    assert number_of_columns == 3
def test_cvf_shared_train_vocabulary_expand(
    additional_size: Optional[int],
    text: Text,
    real_vocabulary_size: int,
    total_vocabulary_size: int,
):
    """Checks total and used size of a shared vocabulary after training."""
    tokenizer = WhitespaceTokenizer()

    # The same number of additional slots is requested for every attribute.
    additional_slots = {
        attribute: additional_size
        for attribute in ("text", "response", "action_text")
    }
    featurizer = CountVectorsFeaturizer(
        {"additional_vocabulary_size": additional_slots, "use_shared_vocab": True},
        finetune_mode=False,
    )

    example = Message(
        data={
            TEXT: text,
            INTENT: "intent_1",
            RESPONSE: text,
            ACTION_TEXT: text,
            ACTION_NAME: "action_1",
        }
    )
    training_data = TrainingData([example])
    tokenizer.train(training_data)
    featurizer.train(training_data)

    vocabulary = featurizer.vectorizers["text"].vocabulary_
    assert len(vocabulary) == total_vocabulary_size

    first_empty_index = featurizer._get_starting_empty_index(vocabulary)
    assert first_empty_index == real_vocabulary_size
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
):
    """Checks sparse count-vector features with the `use_lemma` option set."""
    config = {"use_lemma": use_lemma, "additional_vocabulary_size": {"text": 0}}
    ftr = CountVectorsFeaturizer(config)

    def _message_with_spacy_doc() -> Message:
        # Build a fresh message for `sentence` and attach its spaCy doc.
        built = Message(data={TEXT: sentence})
        built.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
        return built

    train_message = _message_with_spacy_doc()
    test_message = _message_with_spacy_doc()
    for current in (train_message, test_message):
        SpacyTokenizer().process(current)

    ftr.train(TrainingData([train_message]))
    ftr.process(test_message)

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])
    for vecs in (seq_vecs, sen_vecs):
        assert isinstance(vecs.features, scipy.sparse.coo_matrix)

    first_token_row = seq_vecs.features.toarray()[0]
    last_sentence_row = sen_vecs.features.toarray()[-1]
    assert np.all(first_token_row == sequence_features)
    assert np.all(last_sentence_row == sentence_features)
def test_convert_featurizer_process(
    create_or_load_convert_featurizer: Callable[
        [Dict[Text, Any]], ConveRTFeaturizer
    ],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Compares the leading ConveRT feature values with reference numbers."""
    # Replace the URL validation so it simply returns the restricted URL.
    monkeypatch.setattr(
        ConveRTFeaturizer, "_validate_model_url", lambda _: RESTRICTED_ACCESS_URL
    )
    featurizer = create_or_load_convert_featurizer(
        {FEATURIZER_CLASS_ALIAS: "alias", "model_url": RESTRICTED_ACCESS_URL}
    )

    message = Message.build(text="Hey how are you today ?")
    whitespace_tokenizer.process_training_data(TrainingData([message]))
    tokens = featurizer.tokenize(message, attribute=TEXT)
    featurizer.process([message])

    expected_first_token = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456]
    )
    expected_sentence = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    sequence_feats, sentence_feats = message.get_dense_features(TEXT, [])
    sequence_matrix = sequence_feats.features
    sentence_matrix = sentence_feats.features

    assert len(tokens) == len(sequence_matrix)
    assert np.allclose(sequence_matrix[0][:5], expected_first_token, atol=1e-5)
    assert np.allclose(sentence_matrix[-1][:5], expected_sentence, atol=1e-5)
def test_convert_featurizer_tokens_to_text(
    create_or_load_convert_featurizer: Callable[
        [Dict[Text, Any]], ConveRTFeaturizer
    ],
    sentence: Text,
    expected_text: Text,
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Checks the text that `_tokens_to_text` rebuilds from tokens."""
    # Replace the URL validation with a no-op.
    monkeypatch.setattr(ConveRTFeaturizer, "_validate_model_url", lambda _: None)
    featurizer = create_or_load_convert_featurizer(
        {FEATURIZER_CLASS_ALIAS: "alias", "model_url": RESTRICTED_ACCESS_URL}
    )

    message = Message.build(text=sentence)
    whitespace_tokenizer.process_training_data(TrainingData([message]))
    tokens = featurizer.tokenize(message, attribute=TEXT)

    rebuilt_texts = ConveRTFeaturizer._tokens_to_text([tokens])
    actual_text = rebuilt_texts[0]
    assert expected_text == actual_text
def create_zip_file(
    nlu: TrainingData, domain: Domain, stories: StoryGraph, config: Dict, bot: Text
) -> Text:
    """
    Packs the training files of a bot into a zip archive.

    The serialized NLU data, domain, stories and config are handed to
    ``Utility.save_files``; the directory it returns is zipped and then
    removed again.

    :param nlu: nlu data
    :param domain: domain data
    :param stories: stories data
    :param config: config data
    :param bot: bot id, used as base name of the archive inside the system
        temp directory
    :return: path of the created zip file
    """
    directory = Utility.save_files(
        nlu.nlu_as_markdown().encode(),
        domain.as_yaml().encode(),
        stories.as_story_string().encode(),
        yaml.dump(config).encode(),
    )
    zip_path = os.path.join(tempfile.gettempdir(), bot)
    try:
        zip_file = shutil.make_archive(zip_path, format="zip", root_dir=directory)
    finally:
        # Remove the staging directory even when archiving fails, so a failed
        # run does not leave a copy of the training files behind.
        shutil.rmtree(directory)
    return zip_file
def convert_for_training(
    self, domain: Domain, story_graph: StoryGraph,
) -> TrainingData:
    """Builds de-duplicated training data from a domain and a story graph.

    Every user text and intent and every action name and action text found in
    the given domain and story graph appears exactly once in the result, and
    each item lives in its own message.

    Args:
        domain: the domain
        story_graph: a story graph

    Returns:
        training data
    """

    def _user_events(steps):
        # Yield only the user events of the given story steps.
        for step in steps:
            for event in step.events:
                if isinstance(event, UserUttered):
                    yield event

    container = MessageContainerForCoreFeaturization()

    # All action and user (intent-only) substates known from the domain.
    container.derive_messages_from_domain_and_add(domain=domain)

    # All substates seen in the given data. Only user events are collected
    # because all action names and texts are known to the domain.
    container.derive_messages_from_events_and_add(
        events=_user_events(story_graph.story_steps)
    )

    # Reminder: in case of complex recipes that train CountVectorizers, we'll
    # have to make sure that there is at least one user substate with a TEXT
    # to ensure `CountVectorizer` is trained...
    collected_messages = container.all_messages()
    return TrainingData(training_examples=collected_messages)
def test_only_featurizes_text_attribute(
    create_lexical_syntactic_featurizer: Callable[
        [Dict[Text, Any]], LexicalSyntacticFeaturizerGraphComponent
    ],
):
    """Checks that only the TEXT attribute of a message gets features."""
    # Build a message that carries tokens for lots of attributes.
    text = "hello goodbye hello"
    tokens = [
        Token(text=found[0], start=found.start())
        for found in re.finditer(r"\w+", text)
    ]
    message_data = {}
    for attribute in MESSAGE_ATTRIBUTES + DENSE_FEATURIZABLE_ATTRIBUTES:
        message_data.update({attribute: text, TOKENS_NAMES[attribute]: tokens})
    message = Message(data=message_data)

    # train and process
    config = {"alias": "lsf", "features": [["BOS"]]}
    featurizer = create_lexical_syntactic_featurizer(config)
    featurizer.train(TrainingData([message]))
    featurizer.process([message])

    produced = message.features
    assert len(produced) == 1
    assert produced[0].attribute == TEXT
def test_count_vector_featurizer_char(sentence, expected):
    """Checks count-vector features built with the character analyzer."""
    config = {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    ftr = CountVectorsFeaturizer(config)

    train_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(train_message)
    ftr.train(TrainingData([train_message]))

    test_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(test_message)
    ftr.process(test_message)

    # NOTE(review): the features are read from `train_message`, although
    # `test_message` is the one that was just processed - confirm intent.
    sequence, sentence_level = train_message.get_sparse_features(TEXT, [])
    seq_vec = sequence.features if sequence else sequence
    sen_vec = sentence_level.features if sentence_level else sentence_level

    first_token_row = seq_vec.toarray()[0]
    assert np.all(first_token_row == expected)
    assert sen_vec is not None
def test_log_longer_sequence(
    sequence_length: int,
    model_name: Text,
    model_weights: Text,
    should_overflow: bool,
    caplog: LogCaptureFixture,
    create_language_model_featurizer: Callable[
        [Dict[Text, Any]], LanguageModelFeaturizerGraphComponent
    ],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    """Checks logging and features for a message of `sequence_length` words."""
    featurizer = create_language_model_featurizer(
        {"model_name": model_name, "model_weights": model_weights}
    )

    # A message made of `sequence_length` copies of the word "hi".
    repeated_words = " ".join("hi" for _ in range(sequence_length))
    message = Message.build(text=repeated_words)
    whitespace_tokenizer.process_training_data(TrainingData([message]))

    caplog.set_level(logging.DEBUG)
    featurizer.process([message])

    if should_overflow:
        assert "hi hi hi" in caplog.text
    assert len(message.features) >= 2
def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens):
    """Checks intent tokenization with a custom split symbol."""
    # Test for one model should be enough.
    transformers_nlp = HFTransformersNLP(
        {"model_name": "bert", "model_weights": "bert-base-uncased"}
    )
    lm_tokenizer = LanguageModelTokenizer(
        {"intent_tokenization_flag": True, "intent_split_symbol": "+"}
    )

    message = Message.build(text=text)
    message.set(INTENT, text)

    training_data = TrainingData([message])
    for component in (transformers_nlp, lm_tokenizer):
        component.train(training_data)

    intent_tokens = message.get(TOKENS_NAMES[INTENT])
    actual_tokens = [token.text for token in intent_tokens]
    assert actual_tokens == expected_tokens
def test_text_featurizer_window_size(sentence, expected):
    """Checks lexical-syntactic features for a four-slot feature window."""
    window_features = [["upper"], ["digit"], ["low"], ["digit"]]
    featurizer = LexicalSyntacticFeaturizer({"features": window_features})

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})
    for current in (train_message, test_message):
        WhitespaceTokenizer().process(current)

    featurizer.train(TrainingData([train_message]))
    featurizer.process(test_message)

    sequence, sentence_level = test_message.get_sparse_features(TEXT, [])
    seq_vec = sequence.features if sequence else sequence
    sen_vec = sentence_level.features if sentence_level else sentence_level

    assert isinstance(seq_vec, scipy.sparse.coo_matrix)
    assert sen_vec is None
    first_token_row = seq_vec.toarray()[0]
    assert np.all(first_token_row == expected)
def test_count_vector_featurizer_oov_words(
    sentence: Text,
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Checks count-vector features with an OOV token and OOV words set."""
    oov_config = {
        "OOV_token": "__oov__",
        "OOV_words": ["oov_word0", "OOV_word1"],
    }
    ftr = create_featurizer(oov_config)

    train_message = Message(data={TEXT: sentence})
    whitespace_tokenizer.process([train_message])

    training_data = TrainingData([train_message])
    ftr.train(training_data)
    ftr.process_training_data(training_data)

    sequence, sentence_level = train_message.get_sparse_features(TEXT, [])
    seq_vec = sequence.features if sequence else sequence
    sen_vec = sentence_level.features if sentence_level else sentence_level

    first_token_row = seq_vec.toarray()[0]
    assert np.all(first_token_row == expected)
    assert sen_vec is not None
def test_warn_if_part_of_speech_features_cannot_be_computed(
    create_lexical_syntactic_featurizer: Callable[
        [Dict[Text, Any]], LexicalSyntacticFeaturizer
    ],
    sentence: Text,
    feature_config: Dict[Text, Any],
    expected_features: np.ndarray,
):
    """Checks the warning for tokens that lack part-of-speech tags.

    Training must emit a `UserWarning` and add no features to the message;
    processing afterwards must emit no warning at all and add exactly one
    feature whose dense form equals `expected_features`.
    """
    # Local import: only needed to record warnings in the "process" step.
    import warnings

    featurizer = create_lexical_syntactic_featurizer(
        {"alias": "lsf", "features": feature_config}
    )

    # build the message - with tokens but *no* part-of-speech tags
    tokens = [
        Token(text=match[0], start=match.start())
        for match in re.finditer(r"\w+", sentence)
    ]
    message = Message(data={TOKENS_NAMES[TEXT]: tokens})

    # train
    with pytest.warns(
        UserWarning,
        match="Expected training data to include tokens with part-of-speech tags",
    ):
        featurizer.train(TrainingData([message]))
    assert not message.features

    # process
    # `pytest.warns(None)` is deprecated since pytest 7 and rejected by
    # pytest 8, so record every warning with the standard library instead.
    with warnings.catch_warnings(record=True) as records:
        warnings.simplefilter("always")
        featurizer.process([message])
    assert len(records) == 0
    assert len(message.features) == 1
    feature = message.features[0]
    assert np.all(feature.features.todense() == expected_features)
def test_count_vector_featurizer_shared_vocab(
    sentence, intent, response, text_features, intent_features, response_features
):
    """Checks text, intent and response features with a shared vocabulary."""
    ftr = CountVectorsFeaturizer({"use_shared_vocab": True})
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    training_data = TrainingData([train_message])
    tk.train(training_data)
    ftr.train(training_data)

    # (attribute, expected first sequence row, sentence features expected?)
    checks = [
        (TEXT, text_features, True),
        (INTENT, intent_features, False),
        (RESPONSE, response_features, True),
    ]
    for attribute, expected_first_row, expects_sentence_features in checks:
        seq_vec, sen_vec = train_message.get_sparse_features(attribute, [])
        if seq_vec:
            seq_vec = seq_vec.features
        if sen_vec:
            sen_vec = sen_vec.features

        assert np.all(seq_vec.toarray()[0] == expected_first_row)
        if expects_sentence_features:
            assert sen_vec is not None
        else:
            assert sen_vec is None
def _write_nlu_lookup_table_yaml(
    cls, lookup_table: Dict[Text, Any], output_dir_path: Path
) -> None:
    """Converts the examples of a lookup table from `txt` to `YAML` and writes them.

    Nothing is written when the `elements` entry of the lookup table is
    missing, empty or not a string.

    Args:
        lookup_table: Lookup tables items.
        output_dir_path: Path to the target output directory.
    """
    source_file = lookup_table.get("elements")
    refers_to_file = bool(source_file) and isinstance(source_file, str)
    if not refers_to_file:
        return

    elements = read_lookup_table_file(source_file)

    source_path = Path(source_file)
    target_filename = cls.generate_path_for_converted_training_data_file(
        source_path, output_dir_path
    )

    # The file stem becomes the name of the converted lookup table.
    converted_table = {"name": source_path.stem, "elements": elements}
    RasaYAMLWriter().dump(
        target_filename, TrainingData(lookup_tables=[converted_table])
    )
def _additional_training_data_from_stories(self) -> TrainingData:
    """Builds training data from the unique events found in the stories.

    Returns:
        Training data holding the messages derived from the actions, followed
        by the messages derived from the user utterances.
    """

    def _utterance_sort_key(user):
        return user.intent_name or user.text or ""

    def _action_sort_key(action):
        return action.action_name or action.action_text or ""

    utterances, actions = _unique_events_from_stories(self.get_stories())

    # Sort events to guarantee deterministic behavior and to avoid that the
    # NLU model has to be retrained due to changes in the event order within
    # the stories.
    ordered_utterances = sorted(utterances, key=_utterance_sort_key)
    ordered_actions = sorted(actions, key=_action_sort_key)

    messages = [_messages_from_action(action) for action in ordered_actions]
    messages.extend(
        _messages_from_user_utterance(user) for user in ordered_utterances
    )

    logger.debug(
        f"Added {len(messages)} training data examples "
        f"from the story training data."
    )
    return TrainingData(messages)
def test_convert_training_examples(
    spacy_nlp: Language,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
):
    """Checks the output of `featurize_training_examples` for one message."""

    def _create_with_defaults(component_class, resource_name):
        # Create a component from its default config and the shared fixtures.
        return component_class.create(
            component_class.get_default_config(),
            default_model_storage,
            Resource(resource_name),
            default_execution_context,
        )

    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    tokenizer = _create_with_defaults(SpacyTokenizer, "tokenizer")
    count_vectors_featurizer = _create_with_defaults(
        CountVectorsFeaturizer, "count_featurizer"
    )
    spacy_featurizer = _create_with_defaults(SpacyFeaturizer, "spacy_featurizer")

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    training_data = TrainingData([message])
    tokenizer.process_training_data(training_data)
    count_vectors_featurizer.train(training_data)
    count_vectors_featurizer.process_training_data(training_data)
    spacy_featurizer.process_training_data(training_data)

    entity_tag_spec = [
        EntityTagSpec(
            "entity",
            {0: "O", 1: "name", 2: "location"},
            {"O": 0, "name": 1, "location": 2},
            3,
        )
    ]
    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message], attributes=attributes, entity_tag_specs=entity_tag_spec
    )

    assert len(output) == 1
    featurized = output[0]

    for requested in attributes:
        assert requested in featurized
    for not_requested in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert not_requested not in featurized

    # we have sparse sentence, sparse sequence, dense sentence, and dense
    # sequence features in the list
    assert len(featurized[TEXT]) == 4
    if INTENT in attributes:
        # we will just have sparse sentence features
        assert len(featurized[INTENT]) == 1
    if ENTITIES in attributes:
        # one entry per entity tag spec
        assert len(featurized[ENTITIES]) == len(entity_tag_spec)

    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes
def test_is_empty():
    """Checks that training data created without arguments is empty."""
    training_data = TrainingData()
    assert training_data.is_empty()
def test_train_extract_load(
    create_or_load_mitie_extractor: Callable[
        [Dict[Text, Any]], MitieEntityExtractor
    ],
    mitie_model: MitieModel,
    with_trainable_examples: bool,
):
    """Trains, applies and reloads the MITIE entity extractor."""
    # some texts where last token is a city
    texts_ending_with_city = [
        "Bert lives in Berlin",
        "Ernie asks where is Bielefeld",
    ]

    # create some messages with entities
    messages_with_entities = []
    for text in texts_ending_with_city:
        tokens = [
            Token(text=found.group(), start=found.start(), end=found.end())
            for found in re.finditer(r"\w+", text)
        ]
        city_token = tokens[-1]
        city_entity = {
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: city_token.text,
            ENTITY_ATTRIBUTE_START: city_token.start,
            ENTITY_ATTRIBUTE_END: city_token.end,
            EXTRACTOR: None,  # must be None or mitie_entity_extractor.name
        }
        message = Message(text=text)
        message.data[TOKENS_NAMES[TEXT]] = tokens
        message.data[ENTITIES] = [city_entity]
        # not adding an intent is sufficient to make this a "core example"
        if with_trainable_examples:
            message.data[INTENT] = "must have intent otherwise not an NLU example"
        messages_with_entities.append(message)

    def _copies_without_entities():
        # Fresh messages carrying only the text and tokens of the labeled ones.
        return [
            Message(
                data={
                    TEXT: labeled.data[TEXT],
                    TOKENS_NAMES[TEXT]: labeled.data[TOKENS_NAMES[TEXT]],
                }
            )
            for labeled in messages_with_entities
        ]

    # turn them into training data
    training_data = TrainingData(messages_with_entities)

    # train the extractor
    mitie_entity_extractor = create_or_load_mitie_extractor(config={}, load=False)
    mitie_entity_extractor.train(training_data, model=mitie_model)

    # create some messages "without entities" - for processing
    messages_without_entities = _copies_without_entities()

    # process!
    mitie_entity_extractor.process(
        messages=messages_without_entities, model=mitie_model
    )

    # check that extractor added the expected entities to the messages
    # (that initially were) "with no entities"
    if with_trainable_examples:
        # `messages_without_entities` means "without (before process)" here
        labeled_pairs = zip(messages_without_entities, messages_with_entities)
        for processed_message, labeled_message in labeled_pairs:
            assert ENTITIES in processed_message.data
            computed_entities = processed_message.data[ENTITIES]
            assert len(computed_entities) == 1
            # work on a copy, the original is compared again further below
            computed_entity = copy.copy(computed_entities[0])

            # check confidence
            confidence = computed_entity.pop(ENTITY_ATTRIBUTE_CONFIDENCE, "surprise")
            assert confidence is None

            # check extractor
            extractor_name = computed_entity.pop(EXTRACTOR, None)
            assert extractor_name == mitie_entity_extractor.name

            # compare the rest
            expected_entity = labeled_message.data[ENTITIES][0]
            expected_entity.pop(EXTRACTOR)
            assert computed_entity == expected_entity
    else:
        for processed_message in messages_without_entities:
            assert ENTITIES not in processed_message.data

    # load the same extractor again
    loaded_extractor = create_or_load_mitie_extractor(config={}, load=True)

    # check results are the same
    same_messages_without_entities = _copies_without_entities()
    loaded_extractor.process(
        messages=same_messages_without_entities, model=mitie_model
    )
    first_reloaded = same_messages_without_entities[0]
    first_original = messages_without_entities[0]
    assert first_reloaded.data == first_original.data
def test_load_lookup_table(
    source_lookup_table: Dict[Text, Any], expected_lookup_table: Dict[Text, Any]
):
    """Checks the result of `TrainingData._load_lookup_table`."""
    loaded_lookup_table = TrainingData._load_lookup_table(source_lookup_table)
    assert loaded_lookup_table == expected_lookup_table
async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
    """Returns a new, argument-less `TrainingData`; `language` is not used."""
    empty_training_data = TrainingData()
    return empty_training_data
def run() -> TrainingData:
    """Returns a new `TrainingData` created without arguments."""
    result = TrainingData()
    return result
def run(
    param0: TrainingData, param1: TrainingData, param2: TrainingData
) -> TrainingData:
    """Returns a new `TrainingData`; none of the three parameters is used."""
    result = TrainingData()
    return result
def run(self, some_param: TrainingData = TrainingData()) -> TrainingData:
    """Does nothing: `some_param` is ignored and `None` is returned.

    The default `TrainingData()` instance is created once, when the function
    is defined, and is shared by all calls that omit `some_param`.
    """
    return None
def test_incremental_train_featurization(tmp_path: Path):
    """Trains a regex featurizer, reloads it for finetuning and trains again."""

    def _intent_pattern(pattern, name):
        return {"pattern": pattern, "name": name, "usage": "intent"}

    def _tokenized_message():
        # Fresh message for `sentence`, already run through the tokenizer.
        built = Message(data={TEXT: sentence})
        built.set(RESPONSE, sentence)
        built.set(INTENT, "intent")
        WhitespaceTokenizer().train(TrainingData([built]))
        return built

    def _sparse_text_features(source_message):
        # Unwrap the sparse sequence / sentence matrices of the TEXT attribute.
        sequence, sentence_level = source_message.get_sparse_features(TEXT, [])
        if sequence:
            sequence = sequence.features
        if sentence_level:
            sentence_level = sentence_level.features
        return sequence, sentence_level

    patterns = [
        _intent_pattern("[0-9]+", "number"),
        _intent_pattern("\\bhey*", "hello"),
        _intent_pattern("[0-1]+", "binary"),
    ]
    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = _tokenized_message()
    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    # Test featurization of message
    expected = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 0, 0, 0, 0, 0])

    seq_vecs, sen_vec = _sparse_text_features(message)
    assert (6, 8) == seq_vecs.shape
    assert (1, 8) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    model_dir = str(tmp_path)
    persist_value = featurizer.persist("ftr", model_dir)
    loaded_featurizer = RegexFeaturizer.load(
        meta={"number_additional_patterns": 5, "file": persist_value["file"]},
        should_finetune=True,
        model_dir=model_dir,
    )

    new_patterns = [
        _intent_pattern("\\btoday*", "day"),
        _intent_pattern("\\bhey+", "hello"),
    ]
    message = _tokenized_message()
    loaded_featurizer.train(
        TrainingData([message], regex_features=patterns + new_patterns),
        RasaNLUModelConfig(),
    )

    # Test featurization of message, this time for the extra pattern as well.
    expected_token_1 = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    expected_token_2 = np.array([0, 0, 0, 1, 0, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 0, 0, 0, 0])

    seq_vecs, sen_vec = _sparse_text_features(message)
    assert (6, 8) == seq_vecs.shape
    assert (1, 8) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected_token_1)
    assert np.all(seq_vecs.toarray()[-2] == expected_token_2)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    # we also modified a pattern, check if that is correctly modified
    hello_patterns = [
        known
        for known in loaded_featurizer.known_patterns
        if known["name"] == "hello"
    ]
    assert hello_patterns == [new_patterns[1]]
def test_persist_load_for_finetuning(tmp_path: Path):
    """Persists a regex featurizer, reloads it for finetuning and retrains it."""
    patterns = [
        {"pattern": pattern, "name": name, "usage": "intent"}
        for pattern, name in (
            ("[0-9]+", "number"),
            ("\\bhey*", "hello"),
            ("[0-1]+", "binary"),
        )
    ]
    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    model_dir = str(tmp_path)
    persist_value = featurizer.persist("ftr", model_dir)

    # Test all artifacts stored as part of persist
    assert persist_value["file"] == "ftr"
    for artifact_name in ("ftr.patterns.pkl", "ftr.vocabulary_stats.pkl"):
        assert (tmp_path / artifact_name).exists()
    assert featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 3,
    }

    loaded_featurizer = RegexFeaturizer.load(
        meta={"number_additional_patterns": 5, "file": persist_value["file"]},
        should_finetune=True,
        model_dir=model_dir,
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode
    assert loaded_featurizer.pattern_vocabulary_stats == featurizer.vocabulary_stats

    lookup_only_data = TrainingData()
    lookup_only_data.lookup_tables = [
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}
    ]
    loaded_featurizer.train(lookup_only_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
    assert loaded_featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 4,
    }
def test_regex_featurizer_train():
    """Checks regex features for TEXT and RESPONSE and their absence for INTENT."""

    def _sparse_features(attribute):
        # Unwrap the sparse sequence / sentence matrices of `attribute`.
        sequence, sentence_level = message.get_sparse_features(attribute, [])
        if sequence:
            sequence = sequence.features
        if sentence_level:
            sentence_level = sentence_level.features
        return sequence, sentence_level

    patterns = [
        {"pattern": pattern, "name": name, "usage": "intent"}
        for pattern, name in (
            ("[0-9]+", "number"),
            ("\\bhey*", "hello"),
            ("[0-1]+", "binary"),
        )
    ]
    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 0}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    # TEXT and RESPONSE hold the same sentence, so the same checks apply.
    for attribute in (TEXT, RESPONSE):
        seq_vecs, sen_vec = _sparse_features(attribute)
        assert (6, 3) == seq_vecs.shape
        assert (1, 3) == sen_vec.shape
        assert np.all(seq_vecs.toarray()[0] == expected)
        assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = _sparse_features(INTENT)
    assert seq_vecs is None
    assert sen_vec is None
def validate_required_components_from_data(
    pipeline: List["Component"], data: TrainingData
) -> None:
    """Validates that all components are present in the pipeline based on data.

    Nothing is raised; for every kind of training data that no component in
    the pipeline uses, a warning is emitted via
    `rasa.shared.utils.io.raise_warning`.

    Args:
        pipeline: The list of the :class:`rasa.nlu.components.Component`s.
        data: The :class:`rasa.shared.nlu.training_data.training_data.TrainingData`.
    """
    if data.response_examples and not any_components_in_pipeline(
        ["ResponseSelector"], pipeline
    ):
        rasa.shared.utils.io.raise_warning(
            "You have defined training data with examples for training a response "
            "selector, but your NLU pipeline does not include a response selector "
            "component. To train a model on your response selector data, add a "
            "'ResponseSelector' to your pipeline."
        )

    if data.entity_examples and not any_components_in_pipeline(
        TRAINABLE_EXTRACTORS, pipeline
    ):
        rasa.shared.utils.io.raise_warning(
            "You have defined training data consisting of entity examples, but "
            "your NLU pipeline does not include an entity extractor trained on "
            "your training data. To extract non-pretrained entities, add one of "
            f"{TRAINABLE_EXTRACTORS} to your pipeline."
        )

    if data.entity_examples and not any_components_in_pipeline(
        {"DIETClassifier", "CRFEntityExtractor"}, pipeline
    ):
        if data.entity_roles_groups_used():
            rasa.shared.utils.io.raise_warning(
                "You have defined training data with entities that have roles/groups, "
                "but your NLU pipeline does not include a 'DIETClassifier' or a "
                "'CRFEntityExtractor'. To train entities that have roles/groups, "
                "add either 'DIETClassifier' or 'CRFEntityExtractor' to your "
                "pipeline."
            )

    if data.regex_features and not any_components_in_pipeline(
        ["RegexFeaturizer", "RegexEntityExtractor"], pipeline
    ):
        rasa.shared.utils.io.raise_warning(
            "You have defined training data with regexes, but "
            "your NLU pipeline does not include a 'RegexFeaturizer' or a "
            "'RegexEntityExtractor'. To use regexes, include either a "
            "'RegexFeaturizer' or a 'RegexEntityExtractor' in your pipeline."
        )

    if data.lookup_tables and not any_components_in_pipeline(
        ["RegexFeaturizer", "RegexEntityExtractor"], pipeline
    ):
        rasa.shared.utils.io.raise_warning(
            "You have defined training data consisting of lookup tables, but "
            "your NLU pipeline does not include a 'RegexFeaturizer' or a "
            "'RegexEntityExtractor'. To use lookup tables, include either a "
            "'RegexFeaturizer' or a 'RegexEntityExtractor' in your pipeline."
        )

    if data.lookup_tables:
        if not any_components_in_pipeline(
            ["CRFEntityExtractor", "DIETClassifier"], pipeline
        ):
            rasa.shared.utils.io.raise_warning(
                "You have defined training data consisting of lookup tables, but "
                "your NLU pipeline does not include any components that use these "
                "features. To make use of lookup tables, add a 'DIETClassifier' or a "
                "'CRFEntityExtractor' with the 'pattern' feature to your pipeline."
            )
        elif any_components_in_pipeline(["CRFEntityExtractor"], pipeline):
            crf_components = [c for c in pipeline if c.name == "CRFEntityExtractor"]
            # Check whether ANY of the CRFEntityExtractors featurizes
            # `pattern`. The result must not depend on which extractor comes
            # last in the pipeline, hence `any` instead of overwriting a flag
            # per extractor. Each config is a list like
            # [[before], [word], [after]]; a missing entry counts as
            # "no features".
            has_pattern_feature = any(
                "pattern"
                in itertools.chain(*(crf.component_config.get("features") or []))
                for crf in crf_components
            )

            if not has_pattern_feature:
                rasa.shared.utils.io.raise_warning(
                    "You have defined training data consisting of lookup tables, but "
                    "your NLU pipeline's 'CRFEntityExtractor' does not include the "
                    "'pattern' feature. To featurize lookup tables, add the 'pattern' "
                    "feature to the 'CRFEntityExtractor' in your pipeline."
                )

    if data.entity_synonyms and not any_components_in_pipeline(
        ["EntitySynonymMapper"], pipeline
    ):
        rasa.shared.utils.io.raise_warning(
            "You have defined synonyms in your training data, but "
            "your NLU pipeline does not include an 'EntitySynonymMapper'. "
            "To map synonyms, add an 'EntitySynonymMapper' to your pipeline."
        )