def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ftr = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {"text": 0, "response": 0}}
    )
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def test_count_vectors_featurizer_train(
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    featurizer = create_featurizer()

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    whitespace_tokenizer.process_training_data(TrainingData([message]))

    data = TrainingData([message])
    featurizer.train(data)
    featurizer.process_training_data(data)

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert sen_vec is None
    assert (1, 1) == seq_vec.shape
    assert np.all(seq_vec.toarray()[0] == np.array([1]))
def test_count_vectors_featurizer_train():
    featurizer = CountVectorsFeaturizer.create(
        {"additional_vocabulary_size": {"text": 0, "response": 0}},
        RasaNLUModelConfig(),
    )

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert sen_vec is None
    assert (1, 1) == seq_vec.shape
    assert np.all(seq_vec.toarray()[0] == np.array([1]))
def test_count_vector_featurizer_char(
    sentence: Text,
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer({"min_ngram": 1, "max_ngram": 2, "analyzer": "char"})

    train_message = Message(data={TEXT: sentence})
    whitespace_tokenizer.process([train_message])

    data = TrainingData([train_message])
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_text_featurizer_using_pos(sentence, expected, spacy_nlp):
    featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]})

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    SpacyTokenizer().process(train_message)
    SpacyTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))

    featurizer.process(test_message)

    seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert isinstance(seq_vec, scipy.sparse.coo_matrix)
    assert sen_vec is None
    assert np.all(seq_vec.toarray() == expected)
def print_message(message: Message) -> None:
    features = {**message.as_dict_nlu()}

    seq_vecs, sen_vecs = message.get_dense_features(TEXT)
    features["dense"] = {
        "sequence": None if not seq_vecs else dense_message(seq_vecs.features),
        "sentence": None if not sen_vecs else dense_message(sen_vecs.features),
    }

    seq_vecs, sen_vecs = message.get_sparse_features(TEXT)
    features["sparse"] = {
        "sequence": None if not seq_vecs else sparse_message(seq_vecs.features),
        "sentence": None if not sen_vecs else sparse_message(sen_vecs.features),
    }

    if "text_tokens" in features.keys():
        features["text_tokens"] = [t.text for t in features["text_tokens"]]

    if "intent" in features.keys():
        features["intent"] = {k: v for k, v in features["intent"].items() if "id" != k}

    if "intent_ranking" in features.keys():
        features["intent_ranking"] = [
            {k: v for k, v in i.items() if "id" != k}
            for i in features["intent_ranking"]
        ]

    if "diagnostic_data" in features.keys():
        features["diagnostic_data"] = {
            name: {k: dense_message(v) for k, v in comp.items()}
            for name, comp in features["diagnostic_data"].items()
        }

    print(features)
def test_get_sparse_features(
    features: Optional[List[Features]],
    attribute: Text,
    featurizers: List[Text],
    expected_seq_features: Optional[List[Features]],
    expected_sen_features: Optional[List[Features]],
):
    message = Message(data={TEXT: "This is a test sentence."}, features=features)

    actual_seq_features, actual_sen_features = message.get_sparse_features(
        attribute, featurizers
    )
    if actual_seq_features:
        actual_seq_features = actual_seq_features.features
    if actual_sen_features:
        actual_sen_features = actual_sen_features.features

    if expected_seq_features is None:
        assert actual_seq_features is None
    else:
        assert actual_seq_features is not None
        assert np.all(actual_seq_features.toarray() == expected_seq_features)

    if expected_sen_features is None:
        assert actual_sen_features is None
    else:
        assert actual_sen_features is not None
        assert np.all(actual_sen_features.toarray() == expected_sen_features)
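# Nearly every test in this file unwraps the optional `Features` pair returned
# by `Message.get_sparse_features` with the same four-line `if` dance. Below is
# a minimal helper sketch that factors the pattern out. The name
# `unwrap_sparse_features` is illustrative and not part of the test suite, and
# `Tuple` is assumed to be imported from `typing`.
def unwrap_sparse_features(
    message: Message, attribute: Text
) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[scipy.sparse.spmatrix]]:
    seq_vec, sen_vec = message.get_sparse_features(attribute, [])
    # `get_sparse_features` returns `Features` wrappers (or None); the raw
    # scipy sparse matrices live on their `.features` attribute.
    seq_vec = seq_vec.features if seq_vec else None
    sen_vec = sen_vec.features if sen_vec else None
    return seq_vec, sen_vec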
def test_count_vector_featurizer(
    sentence: Text,
    expected: List[List[int]],
    expected_cls: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    ftr = create_featurizer()

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    whitespace_tokenizer.process([train_message])
    whitespace_tokenizer.process([test_message])

    ftr.train(TrainingData([train_message]))

    ftr.process([test_message])

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert isinstance(seq_vecs, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.toarray()
    actual_sen_vecs = sen_vecs.toarray()

    assert np.all(actual_seq_vecs[0] == expected)
    assert np.all(actual_sen_vecs[-1] == expected_cls)
def test_count_vector_featurizer(sentence, expected, expected_cls):
    ftr = CountVectorsFeaturizer()

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)

    ftr.train(TrainingData([train_message]))

    ftr.process(test_message)

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert isinstance(seq_vecs, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.toarray()
    actual_sen_vecs = sen_vecs.toarray()

    assert np.all(actual_seq_vecs[0] == expected)
    assert np.all(actual_sen_vecs[-1] == expected_cls)
def test_text_featurizer(sentence, expected_features):
    featurizer = LexicalSyntacticFeaturizer(
        {
            "features": [
                ["BOS", "upper"],
                ["BOS", "EOS", "prefix2", "digit"],
                ["EOS", "low"],
            ]
        }
    )

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))

    featurizer.process(test_message)

    seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert isinstance(seq_vec, scipy.sparse.coo_matrix)
    assert sen_vec is None
    assert np.all(seq_vec.toarray() == expected_features[:-1])
def test_count_vector_featurizer_using_tokens(tokens, expected):
    ftr = CountVectorsFeaturizer()

    # Use an empty string instead of real text to make sure the count vector
    # can only come from the `tokens` feature; relying on `message.text` here
    # would not produce the correct result.
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message(data={TEXT: ""})
    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message(data={TEXT: ""})
    test_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
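# A hedged sketch making the point of the test above explicit: the featurizer
# reads `Token` objects from TOKENS_NAMES[TEXT], so the count vectors track the
# tokens even when `message.text` is empty. The function name, the concrete
# tokens, and the shape assertion are illustrative only, not part of the suite.
def example_tokens_drive_vocabulary():
    msg = Message(data={TEXT: ""})  # empty text on purpose
    msg.set(TOKENS_NAMES[TEXT], [Token("hello", 0), Token("world", 6)])

    ftr = CountVectorsFeaturizer()
    ftr.train(TrainingData([msg]))

    seq_vec, _ = msg.get_sparse_features(TEXT, [])
    # two tokens -> two sequence-feature rows, despite the empty text
    assert seq_vec.features.toarray().shape[0] == 2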
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
):
    ftr = CountVectorsFeaturizer({"use_lemma": use_lemma})

    train_message = Message(data={TEXT: sentence})
    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message = Message(data={TEXT: sentence})
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    SpacyTokenizer().process(train_message)
    SpacyTokenizer().process(test_message)

    ftr.train(TrainingData([train_message]))

    ftr.process(test_message)

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_vecs.features, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs.features, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.features.toarray()
    actual_sen_vecs = sen_vecs.features.toarray()

    assert np.all(actual_seq_vecs[0] == sequence_features)
    assert np.all(actual_sen_vecs[-1] == sentence_features)
def test_count_vector_featurizer_oov_words(sentence, expected):
    ftr = CountVectorsFeaturizer(
        {
            "OOV_token": "__oov__",
            "OOV_words": ["oov_word0", "OOV_word1"],
            "additional_vocabulary_size": {"text": 0},
        }
    )
    train_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(train_message)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    # tokenize the test message as well, otherwise the featurizer has no
    # tokens to process
    WhitespaceTokenizer().process(test_message)
    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer_char(sentence, expected):
    ftr = CountVectorsFeaturizer(
        {
            "min_ngram": 1,
            "max_ngram": 2,
            "analyzer": "char",
            "additional_vocabulary_size": {"text": 0},
        }
    )

    train_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(train_message)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(test_message)
    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer_attribute_featurization(
    sentence: Text,
    intent: Text,
    response: Optional[Text],
    intent_features: List[List[int]],
    response_features: Optional[List[List[int]]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    ftr = create_featurizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def fetch_sparse_features(txt, tokenizer, featurizer):
    # featurize the given text (previously a hardcoded sentence was used here,
    # which silently ignored the `txt` parameter)
    message = Message({TEXT: txt})
    tokenizer.process(message)
    featurizer.train(TrainingData([message]))
    featurizer.process(message)
    seq_vecs, sen_vecs = message.get_sparse_features(TEXT, [])
    return seq_vecs.features.toarray()
def test_count_vector_featurizer_shared_vocab(
    sentence: Text,
    intent: Text,
    response: Text,
    text_features: List[List[int]],
    intent_features: List[List[int]],
    response_features: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer({"use_shared_vocab": True})

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None

    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None

    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None
def test_count_vector_featurizer_shared_vocab(
    sentence, intent, response, text_features, intent_features, response_features
):
    ftr = CountVectorsFeaturizer(
        {
            "use_shared_vocab": True,
            "additional_vocabulary_size": {"text": 0, "response": 0},
        }
    )
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None

    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None

    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None
def test_count_vector_featurizer_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def _get_sentence_features(message: Message) -> scipy.sparse.spmatrix:
    _, dense_sentence_features = message.get_dense_features(TEXT)
    if dense_sentence_features is not None:
        rasa.shared.utils.io.raise_warning(
            "Dense features are being computed but not used in "
            "the SparseNaiveBayesIntentClassifier."
        )

    _, sentence_features = message.get_sparse_features(TEXT)
    if sentence_features is not None:
        return sentence_features.features

    raise ValueError(
        "No sparse sentence features present. "
        "Not able to train sklearn intent classifier."
    )
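# A minimal sketch of how the sparse sentence features returned above could be
# stacked and fed to a scikit-learn estimator. The classifier choice
# (BernoulliNB) and the helper name are illustrative assumptions, not the
# actual SparseNaiveBayesIntentClassifier implementation.
def example_fit_on_sentence_features(messages: List[Message], labels: List[Text]):
    import scipy.sparse
    from sklearn.naive_bayes import BernoulliNB

    # one sentence-feature row per message, stacked into a single sparse matrix
    X = scipy.sparse.vstack([_get_sentence_features(m) for m in messages])
    clf = BernoulliNB()
    clf.fit(X, labels)  # scikit-learn estimators accept scipy sparse input
    return clf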
def test_text_featurizer_using_pos_with_action_text(
    sentence: Text, expected: np.ndarray, spacy_nlp
):
    featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]})

    train_message = Message(data={TEXT: sentence, ACTION_TEXT: sentence})
    test_message = Message(data={TEXT: sentence, ACTION_TEXT: sentence})

    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    train_message.set(SPACY_DOCS[ACTION_TEXT], spacy_nlp(sentence))
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message.set(SPACY_DOCS[ACTION_TEXT], spacy_nlp(sentence))

    SpacyTokenizer().process(train_message)
    SpacyTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))

    # Checking that text is processed as expected
    featurizer.process(test_message)

    seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert isinstance(seq_vec, scipy.sparse.coo_matrix)
    assert sen_vec is None
    assert np.all(seq_vec.toarray() == expected)

    # Checking that action_text does not get processed and passing attribute works
    featurizer.process(test_message)

    seq_vec, sen_vec = test_message.get_sparse_features(ACTION_TEXT, [])
    assert seq_vec is None
    assert sen_vec is None
def fetch_sparse_features(txt, tokenizer, featurizer):
    message = Message({TEXT: txt})
    tokenizer.process(message)
    featurizer.train(TrainingData([message]))
    featurizer.process(message)
    seq_vecs, sen_vecs = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features
    return seq_vecs.toarray()
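# A hedged usage sketch for the helper above, using the same old-style
# tokenizer/featurizer API as the surrounding tests. The sentence and the
# shape assertion are illustrative only.
def example_fetch_sparse_features_usage():
    tokenizer = WhitespaceTokenizer()
    featurizer = CountVectorsFeaturizer()
    vecs = fetch_sparse_features("hello hello goodbye", tokenizer, featurizer)
    # sequence features: one row per whitespace token of the input sentence
    assert vecs.shape[0] == 3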
def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    ftr = CountVectorsFeaturizer(
        {
            "token_pattern": r"(?u)\b\w+\b",
            "additional_vocabulary_size": {"text": 0, "response": 0, "action_text": 0},
        }
    )
    tk = WhitespaceTokenizer()

    # add a second example that has some response, so that the vocabulary for
    # response exists
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])

    tk.train(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    for module in [tk, ftr]:
        module.process(test_message)

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    assert action_name_seq_vecs.toarray()[0] == action_name_features
    assert action_name_sen_vecs is None
def test_count_vector_featurizer_oov_token(sentence, expected):
    ftr = CountVectorsFeaturizer({"OOV_token": "__oov__"})

    train_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(train_message)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    # tokenize the test message as well, otherwise the featurizer has no
    # tokens to process
    WhitespaceTokenizer().process(test_message)
    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer({"token_pattern": r"(?u)\b\w+\b"})

    # add a second example that has some response, so that the vocabulary for
    # response exists
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    whitespace_tokenizer.process([test_message])
    ftr.process([test_message])

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    assert action_name_seq_vecs.toarray()[0] == action_name_features
    assert action_name_sen_vecs is None
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    load_featurizer: Callable[..., CountVectorsFeaturizer],
    spacy_tokenizer: SpacyTokenizer,
):
    config = {"use_lemma": use_lemma, "OOV_words": ["drinks"], "OOV_token": "OOV"}
    ftr = create_featurizer(config)

    train_message = Message(data={TEXT: sentence})
    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message = Message(data={TEXT: sentence})
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    spacy_tokenizer.process([train_message])
    spacy_tokenizer.process([test_message])

    ftr.train(TrainingData([train_message]), model=SpacyModel(spacy_nlp, "en"))

    ftr.process([test_message])

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_vecs.features, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs.features, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.features.toarray()
    actual_sen_vecs = sen_vecs.features.toarray()

    assert np.all(actual_seq_vecs[0] == sequence_features)
    assert np.all(actual_sen_vecs[-1] == sentence_features)

    loaded = load_featurizer(config)
    assert loaded.OOV_words == ftr.OOV_words
def test_count_vector_featurizer_oov_token(
    sentence: Text,
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    ftr = create_featurizer({"OOV_token": "__oov__"})

    train_message = Message(data={TEXT: sentence})
    whitespace_tokenizer.process([train_message])

    data = TrainingData([train_message])
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_incremental_train_featurization(tmp_path: Path):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    # Test featurization of message
    expected = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 0, 0, 0, 0, 0])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 8) == seq_vecs.shape
    assert (1, 8) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    persist_value = featurizer.persist("ftr", str(tmp_path))
    loaded_featurizer = RegexFeaturizer.load(
        meta={"number_additional_patterns": 5, "file": persist_value["file"]},
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    new_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhey+", "name": "hello", "usage": "intent"},
    ]

    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    loaded_featurizer.train(
        TrainingData([message], regex_features=patterns + new_patterns),
        RasaNLUModelConfig(),
    )

    # Test featurization of message, this time for the extra pattern as well.
    expected_token_1 = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    expected_token_2 = np.array([0, 0, 0, 1, 0, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 0, 0, 0, 0])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 8) == seq_vecs.shape
    assert (1, 8) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected_token_1)
    assert np.all(seq_vecs.toarray()[-2] == expected_token_2)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    # We also modified a pattern; check that it was correctly updated.
    pattern_to_check = [
        pattern
        for pattern in loaded_featurizer.known_patterns
        if pattern["name"] == "hello"
    ]
    assert pattern_to_check == [new_patterns[1]]
def test_regex_featurizer_train():
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 0}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert seq_vecs is None
    assert sen_vec is None
def test_count_vector_featurizer_persist_load(tmp_path: Path):
    # set non-default values in the config
    config = {
        "analyzer": "char",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = CountVectorsFeaturizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"
    train_message1 = Message(data={TEXT: sentence1})
    train_message2 = Message(data={TEXT: sentence2})
    WhitespaceTokenizer().process(train_message1)
    WhitespaceTokenizer().process(train_message2)
    data = TrainingData([train_message1, train_message2])
    train_ftr.train(data)

    # persist featurizer
    file_dict = train_ftr.persist("ftr", str(tmp_path))
    train_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in train_ftr.vectorizers.items()
    }

    # add trained vocabulary to vectorizer params
    for attribute in train_vect_params:
        if hasattr(train_ftr.vectorizers[attribute], "vocabulary_"):
            train_vect_params[attribute].update(
                {"vocabulary": train_ftr.vectorizers[attribute].vocabulary_}
            )

    # load featurizer
    meta = train_ftr.component_config.copy()
    meta.update(file_dict)
    test_ftr = CountVectorsFeaturizer.load(meta, str(tmp_path), finetune_mode=False)
    test_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in test_ftr.vectorizers.items()
    }

    assert train_vect_params == test_vect_params

    # check that the vocabulary was loaded correctly
    assert hasattr(test_ftr.vectorizers[TEXT], "vocabulary_")

    test_message1 = Message(data={TEXT: sentence1})
    WhitespaceTokenizer().process(test_message1)
    test_ftr.process(test_message1)
    test_message2 = Message(data={TEXT: sentence2})
    WhitespaceTokenizer().process(test_message2)
    test_ftr.process(test_message2)

    test_seq_vec_1, test_sen_vec_1 = test_message1.get_sparse_features(TEXT, [])
    if test_seq_vec_1:
        test_seq_vec_1 = test_seq_vec_1.features
    if test_sen_vec_1:
        test_sen_vec_1 = test_sen_vec_1.features
    train_seq_vec_1, train_sen_vec_1 = train_message1.get_sparse_features(TEXT, [])
    if train_seq_vec_1:
        train_seq_vec_1 = train_seq_vec_1.features
    if train_sen_vec_1:
        train_sen_vec_1 = train_sen_vec_1.features
    test_seq_vec_2, test_sen_vec_2 = test_message2.get_sparse_features(TEXT, [])
    if test_seq_vec_2:
        test_seq_vec_2 = test_seq_vec_2.features
    if test_sen_vec_2:
        test_sen_vec_2 = test_sen_vec_2.features
    train_seq_vec_2, train_sen_vec_2 = train_message2.get_sparse_features(TEXT, [])
    if train_seq_vec_2:
        train_seq_vec_2 = train_seq_vec_2.features
    if train_sen_vec_2:
        train_sen_vec_2 = train_sen_vec_2.features

    # check that train features and test features after loading are the same
    assert np.all(test_seq_vec_1.toarray() == train_seq_vec_1.toarray())
    assert np.all(test_sen_vec_1.toarray() == train_sen_vec_1.toarray())
    assert np.all(test_seq_vec_2.toarray() == train_seq_vec_2.toarray())
    assert np.all(test_sen_vec_2.toarray() == train_sen_vec_2.toarray())