def test_count_vector_featurizer_shared_vocab(
    sentence, intent, response, text_features, intent_features, response_features
):
    ftr = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "use_shared_vocab": True}
    )
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    tk.train(data)
    ftr.train(data)

    assert np.all(
        train_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == text_features
    )
    assert np.all(
        train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] == intent_features
    )
    assert np.all(
        train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0]
        == response_features
    )

def test_convert_featurizer_process(monkeypatch: MonkeyPatch):
    tokenizer = WhitespaceTokenizer()

    monkeypatch.setattr(
        ConveRTFeaturizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    component_config = {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    featurizer = ConveRTFeaturizer(component_config)
    sentence = "Hey how are you today ?"
    message = Message.build(text=sentence)

    td = TrainingData([message])
    tokenizer.train(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    featurizer.process(message, tf_hub_module=featurizer.module)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

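# `_get_validated_model_url` is monkeypatched above because ConveRTFeaturizer
# validates its configured `model_url` on construction; the tests point it at
# RESTRICTED_ACCESS_URL, a test constant referring to a non-public copy of the
# ConveRT model, since the original public download is no longer available.
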
def test_cvf_incremental_train_vocabulary_overflow(tmp_path: Path):
    additional_size = 3
    original_train_text = "hello my name is John."
    additional_train_text = "I am also new."
    tokenizer = WhitespaceTokenizer()
    original_featurizer = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {"text": additional_size}}, finetune_mode=False
    )
    train_message = Message(data={"text": original_train_text})

    data = TrainingData([train_message])
    tokenizer.train(data)
    original_featurizer.train(data)
    file_dict = original_featurizer.persist("ftr", str(tmp_path))

    # load original_featurizer
    meta = original_featurizer.component_config.copy()
    meta.update(file_dict)
    new_featurizer = CountVectorsFeaturizer.load(
        meta, str(tmp_path), should_finetune=True
    )

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tokenizer.train(data)
    with pytest.warns(UserWarning) as warning:
        new_featurizer.train(data)
    assert "New data contains vocabulary of size" in warning[0].message.args[0]

def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        assert (
            train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0]
            == intent_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None

    if response_features:
        assert (
            train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0]
            == response_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None

def test_count_vector_featurizer_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        assert (
            train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0]
            == intent_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None

    if response_features:
        assert (
            train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0]
            == response_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None

def test_train_tokenizer(
    text: Text, expected_tokens: List[Text], expected_indices: List[Tuple[int]]
):
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(RESPONSE, text)
    message.set(INTENT, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [RESPONSE, TEXT]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]

    # check intent attribute
    tokens = training_data.training_examples[0].get(TOKENS_NAMES[INTENT])

    assert [t.text for t in tokens] == [text]

def test_whitespace_training(supervised_embeddings_config):
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
    ]

    tk = WhitespaceTokenizer()

    tk.train(TrainingData(training_examples=examples), supervised_embeddings_config)

    assert examples[0].data.get(TOKENS_NAMES[TEXT])[0].text == "Any"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[1].text == "Mexican"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[2].text == "restaurant"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[3].text == "will"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[4].text == "do"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[0].text == "I"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[1].text == "want"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[2].text == "Tacos"

def test_count_vector_featurizer_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None

def test_count_vector_featurizer_shared_vocab(
    sentence, intent, response, text_features, intent_features, response_features
):
    ftr = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "use_shared_vocab": True}
    )
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    tk.train(data)
    ftr.train(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None

    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None

    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None

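# With `use_shared_vocab: True` the CountVectorsFeaturizer builds a single
# vocabulary across the TEXT, INTENT and RESPONSE attributes, which is why the
# sequence vectors above can all be compared against feature lists of the same
# width. Sentence-level features are produced for TEXT and RESPONSE but not for
# INTENT (hence the `sen_vec is None` assertion), since intent labels are only
# featurized as token sequences.
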
async def test_train_persist_load_with_composite_entities(
    crf_entity_extractor: Callable[
        [Dict[Text, Any]], CRFEntityExtractorGraphComponent
    ],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
):
    importer = RasaFileImporter(
        training_data_paths=["data/test/demo-rasa-composite-entities.yml"]
    )
    training_data = importer.get_nlu_data()

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    crf_extractor = crf_entity_extractor({})
    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})

    tokenizer.process(message)
    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    loaded_extractor = CRFEntityExtractorGraphComponent.load(
        CRFEntityExtractorGraphComponent.get_default_config(),
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()

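# In the graph-component API used here, `crf_extractor.train(...)` persists the
# trained model to the shared model storage under its resource (presumably
# `Resource("CRFEntityExtractor")`, created by the `crf_entity_extractor`
# fixture). That is what lets `CRFEntityExtractorGraphComponent.load(...)`
# restore an equivalent extractor that produces an identical message fingerprint.
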
def test_use_shared_vocab_exception(
    initial_train_text: Text,
    additional_train_text: Text,
    use_shared_vocab: bool,
    tmp_path: Path,
):
    """Tests if an exception is raised when `use_shared_vocab` is set to True
    during incremental training."""
    tk = WhitespaceTokenizer()
    initial_cvf = CountVectorsFeaturizer(
        component_config={"use_shared_vocab": use_shared_vocab}
    )
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])
    tk.train(data)
    initial_cvf.train(data)

    file_dict = initial_cvf.persist("ftr", tmp_path)
    meta = initial_cvf.component_config.copy()
    meta.update(file_dict)
    new_cvf = CountVectorsFeaturizer.load(meta, tmp_path, should_finetune=True)

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tk.train(data)

    if use_shared_vocab:
        with pytest.raises(Exception) as exec_info:
            new_cvf.train(data)
        assert (
            "Using a shared vocabulary in `CountVectorsFeaturizer` is not supported"
            in str(exec_info.value)
        )
    else:
        new_cvf.train(data)

def test_flexible_nlu_pipeline():
    message = Message("This is a test message.", data={"intent": "test"})
    training_data = TrainingData([message, message, message, message, message])

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"}
    )
    featurizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={
            FEATURIZER_CLASS_ALIAS: "cvf_char",
            "min_ngram": 1,
            "max_ngram": 3,
            "analyzer": "char_wb",
        }
    )
    featurizer.train(training_data)

    featurizer = LexicalSyntacticFeaturizer({})
    featurizer.train(training_data)

    assert len(message.features) == 6
    assert message.features[0].origin == "cvf_word"
    assert message.features[0].type == FEATURE_TYPE_SEQUENCE
    assert message.features[1].origin == "cvf_word"
    assert message.features[1].type == FEATURE_TYPE_SENTENCE
    # cvf word is also extracted for the intent
    assert message.features[2].origin == "cvf_word"
    assert message.features[2].type == FEATURE_TYPE_SEQUENCE
    assert message.features[3].origin == "cvf_char"
    assert message.features[3].type == FEATURE_TYPE_SEQUENCE
    assert message.features[4].origin == "cvf_char"
    assert message.features[4].type == FEATURE_TYPE_SENTENCE
    assert message.features[5].origin == "LexicalSyntacticFeaturizer"
    assert message.features[5].type == FEATURE_TYPE_SEQUENCE

    sequence_feature_dim = (
        message.features[0].features.shape[1] + message.features[5].features.shape[1]
    )
    sentence_feature_dim = message.features[0].features.shape[1]

    classifier = DIETClassifier(
        component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]}
    )
    model_data = classifier.preprocess_train_data(training_data)

    assert len(model_data.get(TEXT_SENTENCE_FEATURES)) == 1
    assert len(model_data.get(TEXT_SEQUENCE_FEATURES)) == 1
    assert len(model_data.get(LABEL_SEQUENCE_FEATURES)) == 1
    assert len(model_data.get(LABEL_SENTENCE_FEATURES)) == 0
    assert model_data.get(TEXT_SEQUENCE_FEATURES)[0][0].shape == (
        5,
        sequence_feature_dim,
    )
    assert model_data.get(TEXT_SENTENCE_FEATURES)[0][0].shape == (
        1,
        sentence_feature_dim,
    )
    assert model_data.get(LABEL_SEQUENCE_FEATURES)[0][0].shape == (1, 1)

def test_cvf_incremental_train_vocabulary(
    additional_size: Optional[int],
    original_train_text: Text,
    additional_train_text: Text,
    total_vocabulary_size: int,
    remaining_buffer_size: int,
    tmp_path: Path,
):
    tokenizer = WhitespaceTokenizer()
    original_featurizer = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {"text": additional_size}}, finetune_mode=False
    )
    train_message = Message(data={"text": original_train_text})

    data = TrainingData([train_message])
    tokenizer.train(data)
    original_featurizer.train(data)

    # Check total vocabulary size with buffer slots before finetuning
    original_vocabulary = original_featurizer.vectorizers["text"].vocabulary_
    assert len(original_vocabulary) == total_vocabulary_size

    file_dict = original_featurizer.persist("ftr", str(tmp_path))

    # load original_featurizer
    meta = original_featurizer.component_config.copy()
    meta.update(file_dict)
    new_featurizer = CountVectorsFeaturizer.load(
        meta, str(tmp_path), should_finetune=True
    )

    # Check total vocabulary size with buffer slots before finetuning
    assert len(new_featurizer.vectorizers["text"].vocabulary_) == total_vocabulary_size

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tokenizer.train(data)
    new_featurizer.train(data)
    new_vocabulary = new_featurizer.vectorizers["text"].vocabulary_

    # Check total vocabulary size with buffer slots after finetuning
    assert len(new_vocabulary) == total_vocabulary_size

    # Check remaining buffer slots after finetuning
    assert (
        len(new_vocabulary)
        - new_featurizer._get_starting_empty_index(new_vocabulary)
        == remaining_buffer_size
    )

    # Check indices of original vocabulary haven't changed in the new vocabulary
    for vocab_token, vocab_index in original_vocabulary.items():
        if not vocab_token.startswith("buf_"):
            assert vocab_token in new_vocabulary
            assert new_vocabulary.get(vocab_token) == vocab_index

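# `additional_vocabulary_size` pre-allocates placeholder tokens (prefixed with
# `buf_`) in the fitted vocabulary. During finetuning, new real tokens overwrite
# these buffer slots instead of growing the vocabulary, so the sparse feature
# dimension stays fixed; `_get_starting_empty_index` locates the first slot that
# is still unused.
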
def test_count_vector_featurizer_action_attribute_featurization(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    ftr = CountVectorsFeaturizer(
        {
            "token_pattern": r"(?u)\b\w+\b",
            "additional_vocabulary_size": {"text": 0, "response": 0, "action_text": 0},
        }
    )
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(ACTION_NAME, action_name)
    train_message.set(ACTION_TEXT, action_text)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(ACTION_TEXT, "hi")
    second_message.set(ACTION_NAME, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    action_name_seq_vecs, action_name_sen_vecs = train_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        ACTION_TEXT, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if action_name_features:
        assert action_name_seq_vecs.toarray()[0] == action_name_features
        assert action_name_sen_vecs is None
    else:
        assert action_name_seq_vecs is None
        assert action_name_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None

def test_custom_intent_symbol(text, expected_tokens):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}

    tk = WhitespaceTokenizer(component_config)

    message = Message(text)
    message.set(INTENT, text)

    tk.train(TrainingData([message]))

    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens

def train_texts(
    texts: List[Text], model_name: Text, model_weights: Text
) -> List[Message]:
    config = create_pretrained_transformers_config(model_name, model_weights)
    whitespace_tokenizer = WhitespaceTokenizer()
    transformer = HFTransformersNLP(config)

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)

    whitespace_tokenizer.train(td)
    transformer.train(td)
    return messages

def test_model_data_signature_with_entities(
    messages: List[Message], entity_expected: bool
):
    classifier = DIETClassifier({"BILOU_flag": False})
    training_data = TrainingData(messages)

    # create tokens for entity parsing inside DIET
    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    model_data = classifier.preprocess_train_data(training_data)
    entity_exists = "entities" in model_data.get_signature().keys()
    assert entity_exists == entity_expected

def test_check_correct_entity_annotations(text: Text, warnings: int):
    reader = MarkdownReader()
    tokenizer = WhitespaceTokenizer()

    training_data = reader.reads(text)
    tokenizer.train(training_data)

    with pytest.warns(UserWarning) as record:
        EntityExtractor.check_correct_entity_annotations(training_data)

    assert len(record) == warnings
    assert all(
        excerpt in record[0].message.args[0]
        for excerpt in ["Misaligned entity annotation in sentence"]
    )

def test_apply_bilou_schema():
    tokenizer = WhitespaceTokenizer()

    message_1 = Message("Germany is part of the European Union")
    message_1.set(
        ENTITIES,
        [
            {"start": 0, "end": 7, "value": "Germany", "entity": "location"},
            {
                "start": 23,
                "end": 37,
                "value": "European Union",
                "entity": "organisation",
            },
        ],
    )

    message_2 = Message("Berlin is the capital of Germany")
    message_2.set(
        ENTITIES,
        [
            {"start": 0, "end": 6, "value": "Berlin", "entity": "location"},
            {"start": 25, "end": 32, "value": "Germany", "entity": "location"},
        ],
    )

    training_data = TrainingData([message_1, message_2])

    tokenizer.train(training_data)

    bilou_utils.apply_bilou_schema(training_data)

    assert message_1.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "B-organisation",
        "L-organisation",
        "O",
    ]
    assert message_2.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "U-location",
        "O",
    ]

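# BILOU tagging: B- marks the beginning and L- the last token of a multi-token
# entity, I- any token in between, U- a single-token ("unit") entity, and O a
# token outside any entity. The extra trailing "O" in each expected list appears
# to account for the `__CLS__` token that tokenizers appended to every sequence
# in this Rasa version.
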
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ftr = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {"text": 0, "response": 0}}
    )
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features

    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None

def test_whitespace_training(supervised_embeddings_config: RasaNLUModelConfig):
    examples = [
        Message(
            data={
                TEXT: "Any Mexican restaurant will do",
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            }
        ),
        Message(
            data={
                TEXT: "I want Tacos!",
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            }
        ),
        Message(data={TEXT: "action_restart", "action_name": "action_restart"}),
        Message(
            data={
                TEXT: "Where are you going?",
                ACTION_NAME: "Where are you going?",
                ACTION_TEXT: "Where are you going?",
            }
        ),
    ]

    component_config = {"case_sensitive": False, "intent_tokenization_flag": True}
    tk = WhitespaceTokenizer(component_config)

    tk.train(TrainingData(training_examples=examples), supervised_embeddings_config)

    assert examples[0].data.get(TOKENS_NAMES[TEXT])[0].text == "Any"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[1].text == "Mexican"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[2].text == "restaurant"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[3].text == "will"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[4].text == "do"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[0].text == "I"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[1].text == "want"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[2].text == "Tacos"
    assert examples[2].data.get(TOKENS_NAMES[ACTION_NAME])[0].text == "action"
    assert examples[2].data.get(TOKENS_NAMES[ACTION_NAME])[1].text == "restart"
    assert examples[2].data.get(TOKENS_NAMES[TEXT])[0].text == "action_restart"
    assert examples[2].data.get(TOKENS_NAMES[ACTION_TEXT]) is None
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[0].text == "Where"
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[1].text == "are"
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[2].text == "you"
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[3].text == "going"

def test_train_tokenizer_action_name(
    text: Text, expected_tokens: List[Text], expected_indices: List[Tuple[int]]
):
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(ACTION_NAME, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    # check action_name attribute
    tokens = training_data.training_examples[0].get(TOKENS_NAMES[ACTION_NAME])

    assert [t.text for t in tokens] == [text]

def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    ftr = CountVectorsFeaturizer(
        {
            "token_pattern": r"(?u)\b\w+\b",
            "additional_vocabulary_size": {"text": 0, "response": 0, "action_text": 0},
        }
    )
    tk = WhitespaceTokenizer()

    # add a second example that has some response, so that the vocabulary for
    # response exists
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])

    tk.train(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    for module in [tk, ftr]:
        module.process(test_message)

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    assert action_name_seq_vecs.toarray()[0] == action_name_features
    assert action_name_sen_vecs is None

def test_flexible_nlu_pipeline():
    message = Message("This is a test message.", data={"intent": "test"})
    training_data = TrainingData([message, message, message, message, message])

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"}
    )
    featurizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={
            FEATURIZER_CLASS_ALIAS: "cvf_char",
            "min_ngram": 1,
            "max_ngram": 3,
            "analyzer": "char_wb",
        }
    )
    featurizer.train(training_data)

    featurizer = LexicalSyntacticFeaturizer({})
    featurizer.train(training_data)

    assert len(message.features) == 4
    assert message.features[0].origin == "cvf_word"
    # cvf word is also extracted for the intent
    assert message.features[1].origin == "cvf_word"
    assert message.features[2].origin == "cvf_char"
    assert message.features[3].origin == "LexicalSyntacticFeaturizer"

    feature_dim = (
        message.features[0].features.shape[1] + message.features[3].features.shape[1]
    )

    classifier = DIETClassifier(
        component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]}
    )
    model_data = classifier.preprocess_train_data(training_data)

    assert len(model_data.get("text_features")) == 1
    assert len(model_data.get("label_features")) == 1
    assert model_data.get("text_features")[0][0].shape == (6, feature_dim)
    assert model_data.get("label_features")[0][0].shape == (1, 1)

def test_whitespace_training(supervised_embeddings_config):
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
    ]

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)

    tk.train(TrainingData(training_examples=examples), supervised_embeddings_config)

    assert examples[0].data.get("tokens")[0].text == "any"
    assert examples[0].data.get("tokens")[1].text == "mexican"
    assert examples[0].data.get("tokens")[2].text == "restaurant"
    assert examples[0].data.get("tokens")[3].text == "will"
    assert examples[0].data.get("tokens")[4].text == "do"
    assert examples[1].data.get("tokens")[0].text == "i"
    assert examples[1].data.get("tokens")[1].text == "want"
    assert examples[1].data.get("tokens")[2].text == "tacos"

def test_lm_featurizer_number_of_sub_tokens(text, expected_number_of_sub_tokens):
    config = {
        "model_name": "bert",
        "model_weights": "bert-base-uncased",
    }  # Test for one should be enough

    lm_featurizer = LanguageModelFeaturizer(config)
    whitespace_tokenizer = WhitespaceTokenizer()

    message = Message.build(text=text)

    td = TrainingData([message])
    whitespace_tokenizer.train(td)
    lm_featurizer.train(td)

    assert [
        t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])
    ] == expected_number_of_sub_tokens

def test_convert_featurizer_tokens_to_text(
    sentence: Text, expected_text: Text, monkeypatch: MonkeyPatch
):
    tokenizer = WhitespaceTokenizer()

    monkeypatch.setattr(
        ConveRTFeaturizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    component_config = {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    featurizer = ConveRTFeaturizer(component_config)

    message = Message.build(text=sentence)
    td = TrainingData([message])
    tokenizer.train(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0]

    assert expected_text == actual_text

def test_count_vector_featurizer_shared_vocab(
    sentence, intent, response, text_features, intent_features, response_features
):
    ftr = CountVectorsFeaturizer(
        {
            "use_shared_vocab": True,
            "additional_vocabulary_size": {"text": 0, "response": 0},
        }
    )
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    tk.train(data)
    ftr.train(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None

    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None

    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None

def test_cvf_incremental_training(
    initial_train_text: Text,
    additional_train_text: Text,
    initial_vocabulary_size: int,
    final_vocabulary_size: int,
    tmp_path: Path,
):
    tk = WhitespaceTokenizer()
    initial_cvf = CountVectorsFeaturizer()
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])

    tk.train(data)
    initial_cvf.train(data)

    # Check initial vocabulary size
    initial_vocab = initial_cvf.vectorizers["text"].vocabulary_
    assert len(initial_vocab) == initial_vocabulary_size

    # persist and load initial cvf
    file_dict = initial_cvf.persist("ftr", tmp_path)
    meta = initial_cvf.component_config.copy()
    meta.update(file_dict)
    new_cvf = CountVectorsFeaturizer.load(meta, tmp_path, should_finetune=True)

    # Check vocabulary size again
    assert len(new_cvf.vectorizers["text"].vocabulary_) == initial_vocabulary_size

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tk.train(data)
    new_cvf.train(data)

    new_vocab = new_cvf.vectorizers["text"].vocabulary_

    # Check vocabulary size after finetuning
    assert len(new_vocab) == final_vocabulary_size

    # Check indices of initial vocabulary haven't changed in the new vocabulary
    for vocab_token, vocab_index in initial_vocab.items():
        assert vocab_token in new_vocab
        assert new_vocab.get(vocab_token) == vocab_index

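# The persist/load round trip with `should_finetune=True` is how a trained
# CountVectorsFeaturizer is prepared for incremental training. The final loop
# checks the key invariant: tokens learned before finetuning keep their original
# vocabulary indices, so features computed by the old and the finetuned model
# stay aligned.
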
def test_train_tokenizer_e2e_actions(
    text: Text, expected_tokens: List[Text], expected_indices: List[Tuple[int]]
):
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(ACTION_TEXT, text)
    message.set(ACTION_NAME, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [ACTION_TEXT, TEXT]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]