def test_mitie_featurizer(
    create: Callable[[Dict[Text, Any]], MitieFeaturizer],
    mitie_model: MitieModel,
    mitie_tokenizer: MitieTokenizer,
):
    """Sequence and sentence features for a tokenized message match expectations."""
    featurizer = create({"alias": "mitie_featurizer"})

    message = Message(data={TEXT: "Hey how are you today"})
    mitie_tokenizer.process([message])

    seq_vec, sen_vec = featurizer.features_for_tokens(
        message.get(TOKENS_NAMES[TEXT]), mitie_model.word_feature_extractor
    )

    expected_first_token = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    expected_sentence = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    # Five token vectors plus one sentence vector for a five-word sentence.
    assert len(seq_vec) + len(sen_vec) == 6
    assert np.allclose(seq_vec[0][:5], expected_first_token, atol=1e-5)
    assert np.allclose(sen_vec[-1][:5], expected_sentence, atol=1e-5)
def test_train_load_predict_loop(
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    mitie_model: MitieModel,
    mitie_tokenizer: MitieTokenizer,
):
    """Train the intent classifier, reload it from storage, and predict on a message."""
    resource = Resource("mitie_classifier")
    classifier = MitieIntentClassifier.create(
        MitieIntentClassifier.get_default_config(),
        default_model_storage,
        resource,
        default_execution_context,
    )

    training_data = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa.yml"
    )
    # Tokenize message as classifier needs that
    mitie_tokenizer.process_training_data(training_data)

    classifier.train(training_data, mitie_model)

    # Reload from the persisted resource to exercise the full round trip.
    classifier = MitieIntentClassifier.load(
        MitieIntentClassifier.get_default_config(),
        default_model_storage,
        resource,
        default_execution_context,
    )

    test_message = Message({TEXT: "hi"})
    mitie_tokenizer.process([test_message])
    classifier.process([test_message], mitie_model)

    assert test_message.data[INTENT][INTENT_NAME_KEY] == "greet"
    assert test_message.data[INTENT][PREDICTED_CONFIDENCE_KEY] > 0
def test_mitie(text, expected_tokens, expected_indices):
    """Token texts, start offsets, and end offsets match the parametrized data."""
    tokens = MitieTokenizer().tokenize(Message(text), attribute=TEXT)

    assert [token.text for token in tokens] == expected_tokens
    assert [token.start for token in tokens] == [start for start, _ in expected_indices]
    assert [token.end for token in tokens] == [end for _, end in expected_indices]
def inner(config: Dict[Text, Any]) -> MitieTokenizer:
    """Build a ``MitieTokenizer`` from the default config overridden by ``config``."""
    merged_config = {**MitieTokenizer.get_default_config(), **config}
    return MitieTokenizer.create(
        merged_config,
        default_model_storage,
        Resource("mitie_tokenizer"),
        default_execution_context,
    )
def _dense_features_of(message: Message, attribute: Text):
    """Return ``(sequence, sentence)`` dense features of ``message`` for
    ``attribute``, unwrapped to raw arrays; an absent feature stays ``None``."""
    seq, sen = message.get_dense_features(attribute, [])
    # Preserve falsy values (None) as-is; unwrap only real Features objects.
    return (seq.features if seq else seq, sen.features if sen else sen)


def test_mitie_featurizer_train(
    create: Callable[[Dict[Text, Any]], MitieFeaturizer],
    mitie_model: MitieModel,
    mitie_tokenizer: MitieTokenizer,
):
    """Processing training data must featurize TEXT and RESPONSE but not INTENT."""
    featurizer = create({"alias": "mitie_featurizer"})

    sentence = "Hey how are you today"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    mitie_tokenizer.process_training_data(TrainingData([message]))
    featurizer.process_training_data(TrainingData([message]), mitie_model)

    expected = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    # TEXT and RESPONSE carry the same sentence, so expectations are shared.
    for attribute in (TEXT, RESPONSE):
        seq_vec, sen_vec = _dense_features_of(message, attribute)
        assert len(message.get(TOKENS_NAMES[attribute])) == len(seq_vec)
        assert np.allclose(seq_vec[0][:5], expected, atol=1e-5)
        assert np.allclose(sen_vec[-1][:5], expected_cls, atol=1e-5)

    # INTENT must never get dense features.
    seq_vec, sen_vec = _dense_features_of(message, INTENT)
    assert seq_vec is None
    assert sen_vec is None
def test_mitie_add_cls_token():
    """With ``use_cls_token`` enabled the tokenizer appends a trailing CLS token."""
    from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer

    tokenizer = MitieTokenizer({"use_cls_token": True})
    text = "Forecast for lunch"

    expected_texts = ["Forecast", "for", "lunch", CLS_TOKEN]
    assert [token.text for token in tokenizer.tokenize(text)] == expected_texts
    assert [token.offset for token in tokenizer.tokenize(text)] == [0, 9, 13, 19]
def test_custom_intent_symbol(text, expected_tokens):
    """Intent labels are split on the configured ``+`` symbol during training."""
    tokenizer = MitieTokenizer(
        {"intent_tokenization_flag": True, "intent_split_symbol": "+"}
    )

    message = Message(text)
    message.set(INTENT, text)
    tokenizer.train(TrainingData([message]))

    assert [token.text for token in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
def test_mitie_featurizer(mitie_feature_extractor, default_config):
    """Featurizing a tokenized sentence yields the expected vector head."""
    from rasa.nlu.featurizers.mitie_featurizer import MitieFeaturizer

    ftr = MitieFeaturizer.create({"name": "MitieFeaturizer"}, RasaNLUModelConfig())

    tokens = MitieTokenizer().tokenize("Hey how are you today")
    vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor)

    expected = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])
    assert np.allclose(vecs[:5], expected, atol=1e-5)
def test_load_from_untrained(
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    mitie_model: MitieModel,
    mitie_tokenizer: MitieTokenizer,
):
    """Loading from a resource that was never trained yields a no-op classifier."""
    classifier = MitieIntentClassifier.load(
        MitieIntentClassifier.get_default_config(),
        default_model_storage,
        Resource("some_resource"),
        default_execution_context,
    )

    test_message = Message({TEXT: "hi"})
    mitie_tokenizer.process([test_message])
    classifier.process([test_message], mitie_model)

    # An untrained classifier predicts no intent with zero confidence.
    assert test_message.data[INTENT] == {"name": None, "confidence": 0.0}
def _raw_dense_features(message, attribute):
    """Fetch ``(sequence, sentence)`` dense features for ``attribute`` and
    unwrap them to plain arrays; absent features stay ``None``."""
    seq, sen = message.get_dense_features(attribute, [])
    # Keep falsy values (None) untouched; unwrap only present Features objects.
    return (seq.features if seq else seq, sen.features if sen else sen)


def test_mitie_featurizer_train(mitie_feature_extractor):
    """Training must set dense features for TEXT and RESPONSE but not INTENT."""
    featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    MitieTokenizer().train(TrainingData([message]))
    featurizer.train(
        TrainingData([message]),
        RasaNLUModelConfig(),
        **{"mitie_feature_extractor": mitie_feature_extractor},
    )

    expected = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    # TEXT and RESPONSE contain the same sentence, so expectations are shared.
    for attribute in (TEXT, RESPONSE):
        seq_vec, sen_vec = _raw_dense_features(message, attribute)
        assert len(message.get(TOKENS_NAMES[attribute])) == len(seq_vec)
        assert np.allclose(seq_vec[0][:5], expected, atol=1e-5)
        assert np.allclose(sen_vec[-1][:5], expected_cls, atol=1e-5)

    # INTENT is never featurized.
    seq_vec, sen_vec = _raw_dense_features(message, INTENT)
    assert seq_vec is None
    assert sen_vec is None
def test_mitie_featurizer_no_sequence(mitie_feature_extractor, default_config):
    """With ``return_sequence`` disabled only the sentence-level vector is produced."""
    from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer

    featurizer = MitieFeaturizer.create(
        {"name": "MitieFeaturizer", "return_sequence": False}, RasaNLUModelConfig()
    )

    tokens = MitieTokenizer().tokenize(f"Hey how are you today {CLS_TOKEN}")
    vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor)[0]

    expected = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])
    assert np.allclose(vecs[:5], expected, atol=1e-5)
def test_load_from_untrained_but_with_resource_existing(
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    mitie_model: MitieModel,
    mitie_tokenizer: MitieTokenizer,
):
    """A resource directory without a model file must also load as a no-op."""
    resource = Resource("some_resource")

    with default_model_storage.write_to(resource):
        # This makes sure the directory exists but the model file itself doesn't
        pass

    classifier = MitieIntentClassifier.load(
        MitieIntentClassifier.get_default_config(),
        default_model_storage,
        resource,
        default_execution_context,
    )

    test_message = Message({TEXT: "hi"})
    mitie_tokenizer.process([test_message])
    classifier.process([test_message], mitie_model)

    assert test_message.data[INTENT] == {"name": None, "confidence": 0.0}
def test_mitie_featurizer(mitie_feature_extractor, default_config):
    """With ``return_sequence`` enabled the first token's vector is returned."""
    from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer

    featurizer = MitieFeaturizer.create(
        {"name": "MitieFeaturizer", "return_sequence": True}, RasaNLUModelConfig()
    )

    tokens = MitieTokenizer().tokenize(f"Hey how are you today {CLS_TOKEN}")
    vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor)[0]

    expected = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    assert np.allclose(vecs[:5], expected, atol=1e-5)
def test_mitie_featurizer(mitie_feature_extractor):
    """Per-token vectors plus the appended CLS vector match expected values."""
    featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())

    message = Message("Hey how are you today")
    MitieTokenizer().process(message)
    tokens = message.get(TOKENS_NAMES[TEXT])[:-1]  # remove CLS token

    vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor)

    expected_first = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    assert len(vecs) == 6
    assert np.allclose(vecs[0][:5], expected_first, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
def test_mitie():
    """Tokenization handles plain ASCII and non-ASCII text with correct offsets."""
    from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer

    tokenizer = MitieTokenizer()

    text = "Forecast for lunch"
    assert [t.text for t in tokenizer.tokenize(text)] == ["Forecast", "for", "lunch"]
    assert [t.offset for t in tokenizer.tokenize(text)] == [0, 9, 13]

    text = "hey ńöñàśçií how're you?"
    assert [t.text for t in tokenizer.tokenize(text)] == [
        "hey",
        "ńöñàśçií",
        "how",
        "'re",
        "you",
        "?",
    ]
    assert [t.offset for t in tokenizer.tokenize(text)] == [0, 4, 13, 16, 20, 23]
def test_mitie_featurizer_train(mitie_feature_extractor):
    """Training sets dense features for TEXT and RESPONSE but leaves INTENT unset."""
    featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    MitieTokenizer().train(TrainingData([message]))
    featurizer.train(
        TrainingData([message]),
        RasaNLUModelConfig(),
        **{"mitie_feature_extractor": mitie_feature_extractor},
    )

    expected = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    # TEXT and RESPONSE contain the same sentence, so checks are shared.
    for attribute in (TEXT, RESPONSE):
        vecs = message.get(DENSE_FEATURE_NAMES[attribute])
        assert len(message.get(TOKENS_NAMES[attribute])) == len(vecs)
        assert np.allclose(vecs[0][:5], expected, atol=1e-5)
        assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    assert message.get(DENSE_FEATURE_NAMES[INTENT]) is None
def test_mitie():
    """Token texts and offsets are correct for ASCII and unicode input."""
    from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer

    tokenizer = MitieTokenizer()

    simple = "Forecast for lunch"
    assert [tok.text for tok in tokenizer.tokenize(simple)] == ["Forecast", "for", "lunch"]
    assert [tok.offset for tok in tokenizer.tokenize(simple)] == [0, 9, 13]

    unicode_text = "hey ńöñàśçií how're you?"
    assert [tok.text for tok in tokenizer.tokenize(unicode_text)] == [
        "hey",
        "ńöñàśçií",
        "how",
        "'re",
        "you",
        "?",
    ]
    assert [tok.offset for tok in tokenizer.tokenize(unicode_text)] == [0, 4, 13, 16, 20, 23]
from rasa.nlu.constants import (TEXT, SPACY_DOCS)

# Tokenize one sample sentence with each available tokenizer and log the results.
logger = logging_setup()
test_input = "Okay, pick up this yellow banana for me."
message = Message(test_input)

whitespace_tokens = WhitespaceTokenizer().tokenize(message, attribute=TEXT)
logger.info('Whitespace: {}'.format([t.text for t in whitespace_tokens]))

# The spaCy tokenizer needs the parsed doc attached to the message first.
message.set(SPACY_DOCS[TEXT], spacy_nlp(test_input))
spacy_tokens = SpacyTokenizer().tokenize(message, attribute=TEXT)
logger.info('SpaCy: {}'.format([t.text for t in spacy_tokens]))

mitie_tokens = MitieTokenizer().tokenize(message, attribute=TEXT)
logger.info('Mitie: {}'.format([t.text for t in mitie_tokens]))

convert_tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
logger.info('ConveRT: {}'.format([t.text for t in convert_tokens]))

# The language-model tokenizer relies on HFTransformersNLP preprocessing.
transformers_nlp = HFTransformersNLP({"model_name": "bert"})
transformers_nlp.process(message)
bert_tokens = LanguageModelTokenizer().tokenize(message, attribute=TEXT)
logger.info('BERT: {}'.format([t.text for t in bert_tokens]))
def mitie_tokenizer() -> MitieTokenizer:
    """Fixture: a ``MitieTokenizer`` built from its default configuration."""
    default_config = MitieTokenizer.get_default_config()
    return MitieTokenizer(default_config)
from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import CountVectorsFeaturizer
from rasa.nlu.constants import SPACY_DOCS, TEXT, DENSE_FEATURE_NAMES, TOKENS_NAMES, SPARSE_FEATURE_NAMES

# Featurize one sample sentence with each dense featurizer and log vector shapes.
logger = logging_setup()
test_input = "Okay, pick up this yellow banana for me."

# SpaCy: attach the parsed doc, then read the dense features off the message.
spacy_featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())
message = Message(test_input)
message.set(SPACY_DOCS[TEXT], spacy_nlp(test_input))
spacy_featurizer._set_spacy_features(message)
spacy_vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
logger.info("SpaCy: {}".format(spacy_vecs.shape))

# MITIE: tokenize a fresh message and featurize the tokens directly.
message = Message(test_input)
mitie_featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())
MitieTokenizer().process(message)
mitie_vecs = mitie_featurizer.features_for_tokens(
    message.get(TOKENS_NAMES[TEXT]), mitie_feature_extractor
)
logger.info("Mitie: {}".format(mitie_vecs.shape))

# ConveRT: tokenize with CLS appended, then run the featurizer's process step.
convert_featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())
convert_tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
convert_tokens = Tokenizer.add_cls_token(convert_tokens, attribute=TEXT)
message.set(TOKENS_NAMES[TEXT], convert_tokens)
convert_featurizer.process(message)
convert_vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
logger.info("ConveRT: {}".format(convert_vecs.shape))