def test_count_vector_featurizer_oov_words(sentence, expected):
    """Check text features when an OOV token and OOV words are configured.

    A featurizer is trained on one whitespace-tokenized message built from
    ``sentence``. A second message with the same text is then processed and
    the first row of its sparse text features is compared to ``expected``.

    Args:
        sentence: Text used for both the training and the test message.
        expected: Expected values for the first row of the sparse features
            (presumably supplied by a parametrization outside this view).
    """
    config = {
        "token_pattern": r"(?u)\b\w+\b",
        "OOV_token": "__oov__",
        "OOV_words": ["oov_word0", "OOV_word1"],
    }
    featurizer = CountVectorsFeaturizer(config)

    training_example = Message(sentence)
    WhitespaceTokenizer().process(training_example)
    featurizer.train(TrainingData([training_example]))

    # The test message is handed to the featurizer without tokenization.
    processed = Message(sentence)
    featurizer.process(processed)

    first_row = processed.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0]
    assert np.all(first_row == expected)
def test_count_vector_featurizer_oov_token(sentence, expected):
    """Check text features when only an OOV token is configured.

    The featurizer is trained on a whitespace-tokenized message and then
    also run on an untokenized copy. The assertions read the features that
    training stored on the *training* message.

    Args:
        sentence: Text used for both messages.
        expected: Expected values for the first row of the sequence features.
    """
    featurizer = CountVectorsFeaturizer({"OOV_token": "__oov__"})

    training_example = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(training_example)
    featurizer.train(TrainingData([training_example]))

    unseen = Message(data={TEXT: sentence})
    featurizer.process(unseen)

    sequence, sentence_level = training_example.get_sparse_features(TEXT, [])
    # Unwrap the feature containers; falsy values are passed through as-is.
    sequence = sequence.features if sequence else sequence
    sentence_level = sentence_level.features if sentence_level else sentence_level

    assert np.all(sequence.toarray()[0] == expected)
    assert sentence_level is not None
def test_count_vector_featurizer_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    """Check the intent and response features produced during training.

    A single training message carrying text, intent and response is tokenized
    and featurized. When expected features are given for an attribute, the
    first row of its sequence features must match them; otherwise no features
    may exist for that attribute.

    Args:
        sentence: Text of the training message.
        intent: Intent set on the training message.
        response: Response set on the training message.
        intent_features: Expected first row of the intent sequence features,
            or a falsy value when no intent features are expected.
        response_features: Expected first row of the response sequence
            features, or a falsy value when none are expected.
    """
    # Local import keeps this test self-contained.
    import numpy as np

    ftr = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {"text": 0, "response": 0}}
    )
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features

    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        # np.all avoids the ambiguous truth value of a multi-element
        # comparison array inside ``assert``.
        assert np.all(intent_seq_vecs.toarray()[0] == intent_features)
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def test_count_vector_featurizer_char(sentence, expected):
    """Check character n-gram features (n-grams of length 1 to 2).

    Both the training and the test message are whitespace-tokenized. The
    first row of the test message's sparse text features must equal
    ``expected``.

    Args:
        sentence: Text used for both messages.
        expected: Expected values for the first row of the sparse features.
    """
    char_config = {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    featurizer = CountVectorsFeaturizer(char_config)

    training_example = Message(sentence)
    WhitespaceTokenizer().process(training_example)
    featurizer.train(TrainingData([training_example]))

    processed = Message(sentence)
    WhitespaceTokenizer().process(processed)
    featurizer.process(processed)

    first_row = processed.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0]
    assert np.all(first_row == expected)
def test_count_vector_featurizer_oov_words(sentence, expected):
    """Check text features when an OOV token and OOV words are configured.

    The featurizer is trained on a whitespace-tokenized message and then run
    on an untokenized copy. The assertions read the features stored on the
    *training* message.

    Args:
        sentence: Text used for both messages.
        expected: Expected values for the first row of the sequence features.
    """
    config = {
        "token_pattern": r"(?u)\b\w+\b",
        "OOV_token": "__oov__",
        "OOV_words": ["oov_word0", "OOV_word1"],
    }
    featurizer = CountVectorsFeaturizer(config)

    training_example = Message(sentence)
    WhitespaceTokenizer().process(training_example)
    featurizer.train(TrainingData([training_example]))

    unseen = Message(sentence)
    featurizer.process(unseen)

    sequence, sentence_level = training_example.get_sparse_features(TEXT, [])
    first_row = sequence.toarray()[0]
    assert np.all(first_row == expected)
    assert sentence_level is not None
def test_count_vector_featurizer_char(sentence, expected):
    """Check character n-gram features (n-grams of length 1 to 2).

    Both messages are whitespace-tokenized. The assertion reads the sparse
    text features stored on the *training* message.

    Args:
        sentence: Text used for both messages.
        expected: Expected values for the first row of the sparse features.
    """
    char_config = {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    featurizer = CountVectorsFeaturizer(char_config)

    training_example = Message(sentence)
    WhitespaceTokenizer().process(training_example)
    featurizer.train(TrainingData([training_example]))

    processed = Message(sentence)
    WhitespaceTokenizer().process(processed)
    featurizer.process(processed)

    features = training_example.get_sparse_features(TEXT, [])
    first_row = features.toarray()[0]
    assert np.all(first_row == expected)
def test_count_vector_featurizer_shared_vocab(
    sentence, intent, response, text_features, intent_features, response_features
):
    """Check text, intent and response features with a shared vocabulary.

    One training message carrying all three attributes is tokenized and
    featurized with ``use_shared_vocab`` enabled. For each attribute the first
    row of the sequence features is compared to its expected values. Sentence
    features must exist for text and response and be absent for intent.

    Args:
        sentence: Text of the training message.
        intent: Intent set on the training message.
        response: Response set on the training message.
        text_features: Expected first row of the text sequence features.
        intent_features: Expected first row of the intent sequence features.
        response_features: Expected first row of the response sequence features.
    """
    featurizer = CountVectorsFeaturizer(
        {
            "use_shared_vocab": True,
            "additional_vocabulary_size": {"text": 0, "response": 0},
        }
    )
    tokenizer = WhitespaceTokenizer()

    message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    message.set(INTENT, intent)
    message.set(RESPONSE, response)

    training_data = TrainingData([message])
    tokenizer.train(training_data)
    featurizer.train(training_data)

    # (attribute, expected first row, whether sentence features must exist)
    checks = (
        (TEXT, text_features, True),
        (INTENT, intent_features, False),
        (RESPONSE, response_features, True),
    )
    for attribute, expected_row, has_sentence_features in checks:
        sequence, sentence_level = message.get_sparse_features(attribute, [])
        sequence = sequence.features if sequence else sequence
        sentence_level = sentence_level.features if sentence_level else sentence_level

        assert np.all(sequence.toarray()[0] == expected_row)
        assert (sentence_level is not None) == has_sentence_features
def test_cvf_incremental_training(
    initial_train_text: Text,
    additional_train_text: Text,
    initial_vocabulary_size: int,
    final_vocabulary_size: int,
    tmp_path: Path,
):
    """Check that finetuning extends the vocabulary without reindexing it.

    A featurizer is trained on one message, persisted, reloaded with
    ``should_finetune=True`` and trained again on the original message plus an
    additional one. Vocabulary sizes are checked at each stage, and every
    token of the initial vocabulary must keep its index afterwards.

    Args:
        initial_train_text: Text of the first training message.
        additional_train_text: Text of the message added for finetuning.
        initial_vocabulary_size: Expected text vocabulary size after the
            first training and after reloading.
        final_vocabulary_size: Expected text vocabulary size after finetuning.
        tmp_path: Directory used to persist and reload the featurizer.
    """
    tokenizer = WhitespaceTokenizer()
    base_featurizer = CountVectorsFeaturizer()

    first_message = Message(data={"text": initial_train_text})
    training_data = TrainingData([first_message])
    tokenizer.train(training_data)
    base_featurizer.train(training_data)

    # Vocabulary right after the initial training.
    vocab_before = base_featurizer.vectorizers["text"].vocabulary_
    assert len(vocab_before) == initial_vocabulary_size

    # Round-trip through persistence, loading in finetune mode.
    persisted = base_featurizer.persist("ftr", tmp_path)
    meta = base_featurizer.component_config.copy()
    meta.update(persisted)
    finetuned = CountVectorsFeaturizer.load(meta, tmp_path, should_finetune=True)

    # Loading alone must not change the vocabulary size.
    assert len(finetuned.vectorizers["text"].vocabulary_) == initial_vocabulary_size

    second_message = Message(data={"text": additional_train_text})
    training_data = TrainingData([first_message, second_message])
    tokenizer.train(training_data)
    finetuned.train(training_data)

    vocab_after = finetuned.vectorizers["text"].vocabulary_

    # Vocabulary size after finetuning.
    assert len(vocab_after) == final_vocabulary_size

    # Indices of the initial vocabulary must be unchanged.
    for token, index in vocab_before.items():
        assert token in vocab_after
        assert vocab_after[token] == index
def test_count_vector_featurizer_char(sentence, expected):
    """Check character n-gram features (n-grams of length 1 to 2).

    Both messages are whitespace-tokenized. The assertions read the features
    stored on the *training* message.

    Args:
        sentence: Text used for both messages.
        expected: Expected values for the first row of the sequence features.
    """
    char_config = {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    featurizer = CountVectorsFeaturizer(char_config)

    training_example = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(training_example)
    featurizer.train(TrainingData([training_example]))

    processed = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(processed)
    featurizer.process(processed)

    sequence, sentence_level = training_example.get_sparse_features(TEXT, [])
    # Unwrap the feature containers; falsy values are passed through as-is.
    sequence = sequence.features if sequence else sequence
    sentence_level = sentence_level.features if sentence_level else sentence_level

    assert np.all(sequence.toarray()[0] == expected)
    assert sentence_level is not None
def test_count_vector_featurizer(sentence, expected, expected_cls):
    """Check the first and last feature rows of a processed message.

    The features must be a ``scipy.sparse.coo_matrix``. The first row is
    compared to ``expected`` and the last row to ``expected_cls``.

    Args:
        sentence: Text used for both the training and the test message.
        expected: Expected values for the first row of the features.
        expected_cls: Expected values for the last row of the features.
    """
    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    training_example = Message(sentence)
    processed = Message(sentence)
    # Each message is tokenized by its own tokenizer instance.
    for message in (training_example, processed):
        WhitespaceTokenizer().process(message)

    featurizer.train(TrainingData([training_example]))
    featurizer.process(processed)

    feature_key = SPARSE_FEATURE_NAMES[TEXT]
    assert isinstance(processed.get(feature_key), scipy.sparse.coo_matrix)

    dense = processed.get(feature_key).toarray()
    assert np.all(dense[0] == expected)
    assert np.all(dense[-1] == expected_cls)
def test_count_vector_featurizer_oov_token(sentence, expected):
    """Check text features with an OOV token and ``return_sequence`` enabled.

    The first row of the processed message's ``text_sparse_features`` must
    equal ``expected``.

    Args:
        sentence: Text used for both the training and the test message.
        expected: Expected values for the first row of the sparse features.
    """
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    config = {
        "token_pattern": r"(?u)\b\w+\b",
        "OOV_token": "__oov__",
        "return_sequence": True,
    }
    featurizer = CountVectorsFeaturizer(config)

    training_example = Message(sentence)
    # this is needed for a valid training example
    training_example.set("intent", "bla")
    featurizer.train(TrainingData([training_example]))

    processed = Message(sentence)
    featurizer.process(processed)

    first_row = processed.get("text_sparse_features").toarray()[0]
    assert np.all(first_row == expected)
def test_cvf_independent_train_vocabulary_expand(
    additional_size: Optional[int],
    text: Text,
    real_vocabulary_size: int,
    total_vocabulary_size: int,
):
    """Check per-attribute vocabulary sizes with additional vocabulary slots.

    One message carrying text, intent, response, action text and action name
    is featurized with a separate vocabulary per attribute. The text, response
    and action-text vocabularies must have ``total_vocabulary_size`` entries
    with a starting empty index of ``real_vocabulary_size``. The intent and
    action-name vocabularies must contain exactly one entry.

    Args:
        additional_size: Value configured as ``additional_vocabulary_size``
            for text, response and action text.
        text: Text used for the text, response and action-text attributes.
        real_vocabulary_size: Expected starting empty index of each vocabulary.
        total_vocabulary_size: Expected total size of each vocabulary.
    """
    expandable_attributes = [TEXT, RESPONSE, ACTION_TEXT]

    tokenizer = WhitespaceTokenizer()
    config = {
        "additional_vocabulary_size": dict.fromkeys(
            expandable_attributes, additional_size
        )
    }
    featurizer = CountVectorsFeaturizer(config, finetune_mode=False)

    message_data = {
        TEXT: text,
        INTENT: "intent_1",
        RESPONSE: text,
        ACTION_TEXT: text,
        ACTION_NAME: "action_1",
    }
    training_data = TrainingData([Message(data=message_data)])

    tokenizer.train(training_data)
    featurizer.train(training_data)

    for attribute in expandable_attributes:
        vocabulary = featurizer.vectorizers[attribute].vocabulary_
        assert len(vocabulary) == total_vocabulary_size
        first_empty = featurizer._get_starting_empty_index(vocabulary)
        assert first_empty == real_vocabulary_size

    for attribute in [INTENT, ACTION_NAME]:
        vocabulary = featurizer.vectorizers[attribute].vocabulary_
        assert len(vocabulary) == 1
def test_count_vector_featurizer(sentence, expected, expected_cls):
    """Check the first and last feature rows of a processed message.

    The value returned by ``get_sparse_features`` must be a
    ``scipy.sparse.coo_matrix``. Its first row is compared to ``expected``
    and its last row to ``expected_cls``.

    Args:
        sentence: Text used for both the training and the test message.
        expected: Expected values for the first row of the features.
        expected_cls: Expected values for the last row of the features.
    """
    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    training_example = Message(sentence)
    processed = Message(sentence)
    # Each message is tokenized by its own tokenizer instance.
    for message in (training_example, processed):
        WhitespaceTokenizer().process(message)

    featurizer.train(TrainingData([training_example]))
    featurizer.process(processed)

    sparse = processed.get_sparse_features(TEXT, [])
    assert isinstance(sparse, scipy.sparse.coo_matrix)

    dense = sparse.toarray()
    assert np.all(dense[0] == expected)
    assert np.all(dense[-1] == expected_cls)
def test_count_vector_featurizer_char(sentence, expected):
    """Check character n-gram features with ``return_sequence`` enabled.

    The first row of the processed message's ``text_sparse_features`` must
    equal ``expected``.

    Args:
        sentence: Text used for both the training and the test message.
        expected: Expected values for the first row of the sparse features.
    """
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    config = {
        "min_ngram": 1,
        "max_ngram": 2,
        "analyzer": "char",
        "return_sequence": True,
    }
    featurizer = CountVectorsFeaturizer(config)

    training_example = Message(sentence)
    # this is needed for a valid training example
    training_example.set("intent", "bla")
    featurizer.train(TrainingData([training_example]))

    processed = Message(sentence)
    featurizer.process(processed)

    first_row = processed.get("text_sparse_features").toarray()[0]
    assert np.all(first_row == expected)
def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    """Check the action-name features of a processed message.

    The featurizer is trained on two messages, one carrying an action name
    and one carrying an action text. A test message with text, action name
    and action text is then tokenized and featurized. The first row of its
    action-name sequence features must match ``action_name_features``, and
    no action-name sentence features may exist.

    Args:
        sentence: Text of the test message.
        action_name: Action name set on the test message.
        action_text: Action text set on the test message.
        action_name_features: Expected first row of the action-name sequence
            features.
        response_features: Not used by this test; kept so the signature stays
            compatible with the existing parametrization.
    """
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    # Train on two examples so that both ACTION_NAME and ACTION_TEXT occur
    # in the training data.
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])
    tk.train(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    for module in [tk, ftr]:
        module.process(test_message)

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    # np.all avoids the ambiguous truth value of a multi-element comparison
    # array inside ``assert``.
    assert np.all(action_name_seq_vecs.toarray()[0] == action_name_features)
    assert action_name_sen_vecs is None
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    """Check intent and response features when a response vocabulary exists.

    The featurizer is trained on the message under test plus a second message
    that always carries a response. When expected features are given for an
    attribute, the first row of its sequence features must match them;
    otherwise no features may exist for that attribute.

    Args:
        sentence: Text of the message under test.
        intent: Intent set on the message under test.
        response: Response set on the message under test.
        intent_features: Expected first row of the intent sequence features,
            or a falsy value when no intent features are expected.
        response_features: Expected first row of the response sequence
            features, or a falsy value when none are expected.
    """
    # Local import keeps this test self-contained.
    import numpy as np

    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])
    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )

    if intent_features:
        # np.all avoids the ambiguous truth value of a multi-element
        # comparison array inside ``assert``.
        assert np.all(intent_seq_vecs.toarray()[0] == intent_features)
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def test_count_vector_featurizer_no_sequence(sentence, expected):
    """Check the features produced with ``return_sequence`` disabled.

    The processed message's ``text_sparse_features`` must be a
    ``scipy.sparse.coo_matrix`` whose dense form equals ``expected``.

    Args:
        sentence: Text used for both the training and the test message.
        expected: Expected dense feature values.
    """
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    config = {"token_pattern": r"(?u)\b\w+\b", "return_sequence": False}
    featurizer = CountVectorsFeaturizer(config)

    training_example = Message(sentence)
    # this is needed for a valid training example
    training_example.set("intent", "bla")
    featurizer.train(TrainingData([training_example]))

    processed = Message(sentence)
    featurizer.process(processed)

    assert isinstance(
        processed.get("text_sparse_features"), scipy.sparse.coo_matrix
    )

    dense = processed.get("text_sparse_features").toarray()
    assert np.all(dense == expected)
def test_cvf_shared_train_vocabulary_expand(
    additional_size: Optional[int],
    text: Text,
    real_vocabulary_size: int,
    total_vocabulary_size: int,
):
    """Check the shared vocabulary size with additional vocabulary slots.

    One message carrying text, intent, response, action text and action name
    is featurized with ``use_shared_vocab`` enabled. The vocabulary read from
    the text vectorizer must have ``total_vocabulary_size`` entries and a
    starting empty index of ``real_vocabulary_size``.

    Args:
        additional_size: Value configured as ``additional_vocabulary_size``
            for text, response and action text.
        text: Text used for the text, response and action-text attributes.
        real_vocabulary_size: Expected starting empty index of the vocabulary.
        total_vocabulary_size: Expected total size of the vocabulary.
    """
    tokenizer = WhitespaceTokenizer()
    config = {
        "additional_vocabulary_size": dict.fromkeys(
            ("text", "response", "action_text"), additional_size
        ),
        "use_shared_vocab": True,
    }
    featurizer = CountVectorsFeaturizer(config, finetune_mode=False)

    message_data = {
        TEXT: text,
        INTENT: "intent_1",
        RESPONSE: text,
        ACTION_TEXT: text,
        ACTION_NAME: "action_1",
    }
    training_data = TrainingData([Message(data=message_data)])

    tokenizer.train(training_data)
    featurizer.train(training_data)

    vocabulary = featurizer.vectorizers["text"].vocabulary_
    assert len(vocabulary) == total_vocabulary_size

    first_empty = featurizer._get_starting_empty_index(vocabulary)
    assert first_empty == real_vocabulary_size
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Check that features are built from the message's tokens.

    Both messages have an empty text and receive the same pre-built token
    list. The first row of the processed message's sparse text features must
    equal ``expected``.

    Args:
        tokens: Token strings turned into ``Token`` objects at offset 0.
        expected: Expected values for the first row of the sparse features.
    """
    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # The message text is deliberately empty, so the count vector can only
    # come from the tokens set on the message; the text itself would not give
    # the correct result.
    token_objects = [Token(token_text, 0) for token_text in tokens]
    token_key = TOKENS_NAMES[TEXT]

    training_example = Message("")
    training_example.set(token_key, token_objects)
    featurizer.train(TrainingData([training_example]))

    processed = Message("")
    processed.set(token_key, token_objects)
    featurizer.process(processed)

    first_row = processed.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0]
    assert np.all(first_row == expected)
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    """Check intent and response features when a response vocabulary exists.

    The featurizer is trained on the message under test plus a second message
    that always carries a response. When expected features are given for an
    attribute, the first row of its sparse features must match them;
    otherwise the message must hold no sparse features for that attribute.

    Args:
        sentence: Text of the message under test.
        intent: Intent set on the message under test.
        response: Response set on the message under test.
        intent_features: Expected first row of the intent features, or a
            falsy value when no intent features are expected.
        response_features: Expected first row of the response features, or a
            falsy value when none are expected.
    """
    # Local import keeps this test self-contained.
    import numpy as np

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT_ATTRIBUTE, intent)
    train_message.set(RESPONSE_ATTRIBUTE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE_ATTRIBUTE, "hi")
    second_message.set(INTENT_ATTRIBUTE, "greet")

    data = TrainingData([train_message, second_message])
    tk.train(data)
    ftr.train(data)

    intent_vecs = train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE])
    if intent_features:
        # np.all avoids the ambiguous truth value of a multi-element
        # comparison array inside ``assert``.
        assert np.all(intent_vecs.toarray()[0] == intent_features)
    else:
        assert intent_vecs is None

    response_vecs = train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])
    if response_features:
        assert np.all(response_vecs.toarray()[0] == response_features)
    else:
        assert response_vecs is None
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
):
    """Check sequence and sentence features with the ``use_lemma`` option.

    Both messages get a spaCy doc and are tokenized with ``SpacyTokenizer``.
    The processed message's sequence and sentence features must be
    ``scipy.sparse.coo_matrix`` instances. The first sequence row and the last
    sentence row are compared to the expected values.

    Args:
        spacy_nlp: Callable producing a spaCy doc from the sentence.
        sentence: Text used for both the training and the test message.
        sequence_features: Expected first row of the sequence features.
        sentence_features: Expected last row of the sentence features.
        use_lemma: Value passed as the featurizer's ``use_lemma`` option.
    """
    config = {"use_lemma": use_lemma, "additional_vocabulary_size": {"text": 0}}
    featurizer = CountVectorsFeaturizer(config)

    training_example = Message(data={TEXT: sentence})
    training_example.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    processed = Message(data={TEXT: sentence})
    processed.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    # Each message is tokenized by its own tokenizer instance.
    for message in (training_example, processed):
        SpacyTokenizer().process(message)

    featurizer.train(TrainingData([training_example]))
    featurizer.process(processed)

    sequence, sentence_level = processed.get_sparse_features(TEXT, [])

    assert isinstance(sequence.features, scipy.sparse.coo_matrix)
    assert isinstance(sentence_level.features, scipy.sparse.coo_matrix)

    dense_sequence = sequence.features.toarray()
    dense_sentence = sentence_level.features.toarray()

    assert np.all(dense_sequence[0] == sequence_features)
    assert np.all(dense_sentence[-1] == sentence_features)
def test_count_vector_featurizer_shared_vocab(
    sentence, intent, response, text_features, intent_features, response_features
):
    """Check text, intent and response features with a shared vocabulary.

    One training message carrying all three attributes is tokenized and
    featurized with ``use_shared_vocab`` enabled. For each attribute the first
    row of its sparse features is compared to the expected values.

    Args:
        sentence: Text of the training message.
        intent: Intent set on the training message.
        response: Response set on the training message.
        text_features: Expected first row of the text features.
        intent_features: Expected first row of the intent features.
        response_features: Expected first row of the response features.
    """
    config = {"token_pattern": r"(?u)\b\w+\b", "use_shared_vocab": True}
    featurizer = CountVectorsFeaturizer(config)
    tokenizer = WhitespaceTokenizer()

    message = Message(sentence)
    # this is needed for a valid training example
    message.set(INTENT, intent)
    message.set(RESPONSE, response)

    training_data = TrainingData([message])
    tokenizer.train(training_data)
    featurizer.train(training_data)

    expectations = (
        (TEXT, text_features),
        (INTENT, intent_features),
        (RESPONSE, response_features),
    )
    for attribute, expected_row in expectations:
        features = message.get_sparse_features(attribute, [])
        assert np.all(features.toarray()[0] == expected_row)
def test_count_vector_featurizer_persist_load(tmp_path):
    """Check that a persisted featurizer is restored with identical behavior.

    A featurizer with non-default settings is trained on two messages,
    persisted and loaded again. The loaded vectorizer parameters (including
    the trained vocabulary) must equal the trained ones, and the loaded
    featurizer must yield the same sequence and sentence features as training
    did.

    Args:
        tmp_path: Directory used to persist and reload the featurizer.
    """
    # Non-default values, so that a lossy round-trip would be noticed.
    config = {
        "analyzer": "char",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = CountVectorsFeaturizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"

    train_message1 = Message(sentence1)
    train_message2 = Message(sentence2)
    train_ftr.train(TrainingData([train_message1, train_message2]))

    # Persist the featurizer.
    persisted = train_ftr.persist("ftr", str(tmp_path))

    # Collect the vectorizer parameters, adding the trained vocabulary
    # wherever a vectorizer has one.
    train_vect_params = {}
    for attribute, vectorizer in train_ftr.vectorizers.items():
        params = vectorizer.get_params()
        if hasattr(vectorizer, "vocabulary_"):
            params.update({"vocabulary": vectorizer.vocabulary_})
        train_vect_params[attribute] = params

    # Load the featurizer.
    meta = train_ftr.component_config.copy()
    meta.update(persisted)
    test_ftr = CountVectorsFeaturizer.load(meta, str(tmp_path))

    test_vect_params = {}
    for attribute, vectorizer in test_ftr.vectorizers.items():
        test_vect_params[attribute] = vectorizer.get_params()

    assert train_vect_params == test_vect_params

    # Check that the vocabulary was loaded correctly.
    assert hasattr(test_ftr.vectorizers[TEXT], "vocabulary_")

    test_message1 = Message(sentence1)
    test_ftr.process(test_message1)
    test_message2 = Message(sentence2)
    test_ftr.process(test_message2)

    loaded_seq_1, loaded_sen_1 = test_message1.get_sparse_features(TEXT, [])
    trained_seq_1, trained_sen_1 = train_message1.get_sparse_features(TEXT, [])
    loaded_seq_2, loaded_sen_2 = test_message2.get_sparse_features(TEXT, [])
    trained_seq_2, trained_sen_2 = train_message2.get_sparse_features(TEXT, [])

    # Train features and test features after loading must be the same.
    assert np.all(loaded_seq_1.toarray() == trained_seq_1.toarray())
    assert np.all(loaded_sen_1.toarray() == trained_sen_1.toarray())
    assert np.all(loaded_seq_2.toarray() == trained_seq_2.toarray())
    assert np.all(loaded_sen_2.toarray() == trained_sen_2.toarray())
def test_count_vector_featurizer_persist_load(tmpdir):
    """Check that a persisted featurizer is restored with identical behavior.

    A featurizer with non-default settings is trained on two messages,
    persisted and loaded again. The loaded vectorizer parameters (including
    the trained vocabulary) must equal the trained ones, and the loaded
    featurizer must yield the same sparse text features as training did.

    Args:
        tmpdir: Temporary directory whose ``strpath`` is used to persist and
            reload the featurizer.
    """
    # set non default values to config
    config = {
        "analyzer": "char",
        "token_pattern": r"(?u)\b\w+\b",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = CountVectorsFeaturizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"

    train_message1 = Message(sentence1)
    train_message2 = Message(sentence2)

    data = TrainingData([train_message1, train_message2])
    train_ftr.train(data)

    # persist featurizer
    file_dict = train_ftr.persist("ftr", tmpdir.strpath)
    train_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in train_ftr.vectorizers.items()
    }

    # add trained vocabulary to vectorizer params
    for attribute, attribute_vect_params in train_vect_params.items():
        if hasattr(train_ftr.vectorizers[attribute], "vocabulary_"):
            attribute_vect_params.update(
                {"vocabulary": train_ftr.vectorizers[attribute].vocabulary_}
            )

    # load featurizer
    meta = train_ftr.component_config.copy()
    meta.update(file_dict)
    test_ftr = CountVectorsFeaturizer.load(meta, tmpdir.strpath)
    test_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in test_ftr.vectorizers.items()
    }

    assert train_vect_params == test_vect_params

    test_message1 = Message(sentence1)
    test_ftr.process(test_message1)
    test_message2 = Message(sentence2)
    test_ftr.process(test_message2)

    # check that train features and test features after loading are the same;
    # each pair is compared on its own so that feature matrices of different
    # shapes are never stacked into one (possibly ragged) array
    feature_key = SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
    for trained, loaded in (
        (train_message1, test_message1),
        (train_message2, test_message2),
    ):
        assert np.all(
            trained.get(feature_key).toarray() == loaded.get(feature_key).toarray()
        )